Project Business Economic and Financial Data

Sales of DimSum Records, Asian-food restaurant in Medellin, Colombia

2024/2025

Authors: Daniel Gutierrez & Fabio Pimentel

# Required Packages--------------------
# NOTE(review): rm(list = ls()) wipes the whole workspace; acceptable in a
# standalone analysis script, but avoid in reusable code.
rm(list = ls())
library(readxl)    # Excel file import
library(ggplot2)   # plotting
library(GGally)    # ggpairs() scatterplot matrices
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)     # data-manipulation verbs
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate) # date handling (floor_date, ymd)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(corrplot)  # correlation-matrix plots
## corrplot 0.95 loaded
library(feasts)    # time-series features
## Loading required package: fabletools
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
library(tsibble)   # tidy time-series objects
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(forecast)  # seasonplot(), tsdisplay(), classic forecasting tools
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidyr)     # pivot_longer() / pivot_wider()
library(ggthemes)  # extra ggplot themes
library(car)       # regression diagnostics
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(DIMORA)    # diffusion models
## Loading required package: minpack.lm
## Loading required package: numDeriv
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## Loading required package: deSolve
library(tseries)   # stationarity tests
library(lmtest)    # regression specification tests
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
## 
##     index
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(prophet)   # Prophet forecasting model
## Loading required package: Rcpp
## Loading required package: rlang

Import Data

#setwd('/Users/fabiopimentel/Documents/Padua/clases/segundo año primer semestre/BEF data/proyecto/time_series_padova-main')

# target variable
# Daily restaurant sales in Colombian pesos, with a bar/food revenue split.
sales <- read_excel("data/sales/sales_dimsum_31102024.xlsx")

sales[is.na(sales)] <- 0 # set to zero na values (days with no recorded sales)

Creating variables

# economic variables
eco_growth <- read_excel("data/macroeconomic/economic_activity.xlsx") # monthly activity index
fx <- read_excel("data/macroeconomic/fx.xlsx") #Foreign exchange is the conversion of one currency into another
inflation <- read_excel("data/macroeconomic/inflation.xlsx") # monthly inflation
unemployment <- read_excel("data/macroeconomic/unemployment.xlsx") # monthly unemployment rate
# other variables
google_trends <- read_excel("data/other/google_trends_restaurantes.xlsx") # weekly search interest
rain <- read_excel("data/other/rain_proxy.xlsx") # river contributions, used as a rainfall proxy
temp <- read_excel("data/other/temperature_data.xlsx") # daily temperature and precipitation
# NOTE(review): zero-filling temperature NAs would bias means downward if any
# existed; the plots below check that no zeros were introduced.
temp[is.na(temp)] <- 0
rain[is.na(rain)] <- 0
plot(temp$tavg) # no zeros in temp : OK

plot(temp$tmedian) # no zeros in temp : OK- looks better than mean

Explore data structure

# Inspect every imported table; note the different frequencies (daily,
# weekly, monthly) that the aggregation steps below must reconcile.
str(sales)
## tibble [1,095 × 4] (S3: tbl_df/tbl/data.frame)
##  $ date     : POSIXct[1:1095], format: "2021-11-01" "2021-11-02" ...
##  $ sales_cop: num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
##  $ bar      : num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
##  $ food     : num [1:1095] 0 0 0 0 0 0 0 0 0 0 ...
str(eco_growth)
## tibble [238 × 2] (S3: tbl_df/tbl/data.frame)
##  $ date        : POSIXct[1:238], format: "2005-01-01" "2005-02-01" ...
##  $ ise_original: num [1:238] 59.8 61.5 62.2 63 62.7 ...
str(fx) # Foreign exchange is the conversion of one currency into another
## tibble [12,020 × 2] (S3: tbl_df/tbl/data.frame)
##  $ date: POSIXct[1:12020], format: "1991-11-27" "1991-11-28" ...
##  $ fx  : num [1:12020] 693 694 695 695 643 ...
str(inflation)
## tibble [832 × 2] (S3: tbl_df/tbl/data.frame)
##  $ date     : POSIXct[1:832], format: "1955-07-01" "1955-08-01" ...
##  $ inflation: num [1:832] -0.0087 -0.0001 0.0084 0.0077 0.0144 0.0203 0.021 0.0257 0.031 0.0304 ...
str(unemployment)
## tibble [286 × 2] (S3: tbl_df/tbl/data.frame)
##  $ date        : POSIXct[1:286], format: "2001-01-01" "2001-02-01" ...
##  $ unemployment: num [1:286] 16.6 17.4 15.8 14.5 14 ...
str(google_trends)
## tibble [262 × 2] (S3: tbl_df/tbl/data.frame)
##  $ date         : POSIXct[1:262], format: "2019-11-10" "2019-11-17" ...
##  $ google_trends: num [1:262] 64 60 63 65 69 63 80 98 75 63 ...
str(rain)
## tibble [56,648 × 6] (S3: tbl_df/tbl/data.frame)
##  $ date             : POSIXct[1:56648], format: "2021-01-01" "2021-01-01" ...
##  $ region           : chr [1:56648] "ANTIOQUIA" "ANTIOQUIA" "ANTIOQUIA" "ANTIOQUIA" ...
##  $ river            : chr [1:56648] "A. SAN LORENZO" "CARLOS LLERAS" "CONCEPCION" "DESV. EEPPM (NEC,PAJ,DOL)" ...
##  $ contribution_m3s : num [1:56648] 19.29 20.31 3.61 7.48 19.5 ...
##  $ contrribution_kWh: num [1:56648] 4354800 3143300 1011700 2096000 899700 ...
##  $ contribution_pct : num [1:56648] 0.798 0.79 0.712 0.944 0.638 ...
str(temp) # this has NaNs, must fill somehow
## tibble [1,113 × 4] (S3: tbl_df/tbl/data.frame)
##  $ date   : POSIXct[1:1113], format: "2021-10-31" "2021-11-01" ...
##  $ tavg   : num [1:1113] 19.8 18.3 18.4 18.3 18.6 19.8 19.9 19.7 18.8 18.7 ...
##  $ prcp   : num [1:1113] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tmedian: num [1:1113] 19.8 18.3 18.4 18.3 18.6 19.8 19.9 19.7 18.8 18.7 ...

Sales

# create time variables
# Quick look at the raw daily sales series (observation index vs COP).
plot(sales$sales_cop)

Sales Monthly

# sales
## sales monthly
# Aggregate the daily sales to calendar months: total revenue plus the
# bar and food components.
df_sales_m <- sales %>%
  group_by(month = floor_date(date, "month")) %>%  # calendar-month bucket
  summarise(
    sales_m = sum(sales_cop),  # total monthly revenue (COP)
    bar_m   = sum(bar),        # bar revenue
    food_m  = sum(food)        # food revenue
  )

head(df_sales_m)

Sales Weekly

## sales weekly
# Aggregate daily sales to week starts (floor_date defaults to Sunday).
df_sales_w <- sales %>%
  mutate(week = floor_date(date, "week")) %>% # Extract week (floor to week start)
  group_by(week) %>%
  summarise(sales_w = sum(sales_cop), bar_w = sum(bar), food_w = sum(food))     # Summing values

head(df_sales_w)

FX

# fx
# Average the daily COP/USD exchange rate within each month and each week.
df_fx_m <- fx %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(fx_m = mean(fx))

df_fx_w <- fx %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(fx_w = mean(fx))

head(df_fx_m)
head(df_fx_w)

Rain

## rain
# Daily water contribution (m3/s) summed over rivers in Antioquia only;
# used as a regional rainfall proxy. Filtering first avoids aggregating
# regions that are discarded anyway.
df_rain_g <- rain %>%
  filter(region == "ANTIOQUIA") %>%
  group_by(date, region) %>%
  summarise(rain_sum = sum(contribution_m3s))
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.

head(df_rain_g)
# monthly totals
df_rain_m <- df_rain_g %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(rain_m = sum(rain_sum))


# weekly totals
df_rain_w <- df_rain_g %>%
  mutate(week = floor_date(date, "week")) %>%
  group_by(week) %>%
  summarise(rain_w = sum(rain_sum))

head(df_rain_m)
head(df_rain_w)

Temperature

# temperature
# monthly: mean temperature, total precipitation
df_temp_m <- temp %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(temp_m = mean(tavg), prcp_m = sum(prcp))

# weekly: same aggregation at week level
df_temp_w <- temp %>%
  group_by(week = floor_date(date, "week")) %>%
  summarise(temp_w = mean(tavg), prcp_w = sum(prcp))

head(df_temp_m)
head(df_temp_w)

Merging Data Frames

Daily Data

## daily data----------
# sales, rain, fx and temp are the only daily series; chain the inner
# joins on "date" in one pass.
df_merged_d <- Reduce(
  function(left, right) merge(left, right, by = "date", all = FALSE),
  list(sales, df_rain_g, fx, temp)
)

head(df_merged_d)

Weekly Data

### weekly data----------
# Aggregate the Google Trends series to week starts so it joins cleanly.
# (Fix: df_google_w was used below but never created anywhere in the script.)
df_google_w <- google_trends %>%
  mutate(week = floor_date(date, "week")) %>%
  group_by(week) %>%
  summarise(google_w = mean(google_trends))

# Inner joins: keep only weeks present in every source.
df_merged_w <- merge(df_sales_w, df_rain_w, by = "week", all = FALSE)
df_merged_w <- merge(df_merged_w, df_google_w, by = "week", all = FALSE)
df_merged_w <- merge(df_merged_w, df_fx_w, by = "week", all = FALSE)
df_merged_w <- merge(df_merged_w, df_temp_w, by = "week", all = FALSE)

head(df_merged_w)

Monthly Data

### monthly data----------
# Rename columns so every monthly table shares the "month" join key.
names(eco_growth) <- c("month", "ise")
names(inflation) <- c("month", "inflation")
names(unemployment) <- c("month", "unemployment")

# Aggregate the weekly Google Trends series to monthly means.
# (Fix: df_google_m was used below but never created anywhere in the script.)
df_google_m <- google_trends %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(google_m = mean(google_trends))

# Inner joins: keep only months present in every source; the nrow() checks
# confirm no rows are silently dropped along the way.
df_merged_m <- merge(df_sales_m, df_rain_m, by = "month", all = FALSE)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, df_fx_m, by = "month", all = FALSE)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, df_google_m, by = "month", all = FALSE)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, eco_growth, by = "month", all = FALSE) # only has until aug 2024
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, inflation, by = "month", all = FALSE)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, unemployment, by = "month", all = FALSE)
nrow(df_merged_m)
## [1] 36
df_merged_m <- merge(df_merged_m, df_temp_m, by = "month", all = FALSE)
nrow(df_merged_m)
## [1] 36

EDA

# Keep only the three merged data frames used from here on.
objects_to_keep <- c("df_merged_d", "df_merged_w", "df_merged_m")
# Remove all objects except those specified
rm(list = setdiff(ls(), objects_to_keep))

Daily Sales

# sales daily
# Line plots of sales at each frequency.
ggplot(
  df_merged_d, 
  aes(x=date, y=sales_cop)
  ) + geom_line() + ggtitle("Daily Sales of Restaurant")

Weekly sales

# sales weekly
ggplot(df_merged_w, aes(x=week, y=sales_w)) +
  geom_line() + ggtitle("Weekly Sales of Restaurant")

Monthly sales

# sales monthly
ggplot(df_merged_m, aes(x=month, y=sales_m)) +
  geom_line() + ggtitle("Monthly Sales of Restaurant")

Stacked bar plots

We want to move to a stacked bar chart when we care about the relative decomposition of each primary bar based on the levels of a second categorical variable. Each bar is now comprised of a number of sub-bars, each one corresponding with a level of a secondary categorical variable. The total length of each stacked bar is the same as before, but now we can see how the secondary groups contributed to that total.

One important consideration in building a stacked bar chart is to decide which of the two categorical variables will be the primary variable (dictating major axis positions and overall bar lengths) and which will be the secondary (dictating how each primary bar will be subdivided). The most ‘important’ variable should be the primary; use domain knowledge and the specific type of categorical variables to make a decision on how to assign your categorical variables

#Monthly
# Reshape the data to a long format (one row per month x category)
df_sales_m_long <- df_merged_m %>%
  pivot_longer(cols = c(bar_m, food_m), names_to = "Category", values_to = "Value")

# Create the stacked bar plot: bar vs food contribution to monthly sales
ggplot(df_sales_m_long, aes(x = month, y = Value, fill = Category)) +
  geom_bar(stat = "identity", position = "stack") +
  ggtitle("Monthly Sales of Restaurant") +
  labs(y = "Sales", x = "Month", fill = "Category") +
  theme_minimal()

# Weekly
# Reshape the data to a long format
df_sales_w_long <- df_merged_w %>%
  pivot_longer(cols = c(bar_w, food_w), names_to = "Category", values_to = "Value")

# Create the stacked bar plot
ggplot(df_sales_w_long, aes(x = week, y = Value, fill = Category)) +
  geom_bar(stat = "identity", position = "stack") +
  ggtitle("Weekly Sales of Restaurant") +
  labs(y = "Sales", x = "Week", fill = "Category") +
  theme_minimal()

Seasonal plots

# Seasonal plots
# Drop the 2021 stub so the series starts cleanly at week 1 of 2022.
df_sales_w_filtered <- df_merged_w %>%
  filter(week >= ymd("2021-12-31"))


# frequency = 52 approximates the yearly cycle of a weekly series
tseries_w <- ts(df_sales_w_filtered$sales_w , start = c(2022, 1), frequency = 52)
head(tseries_w)
## Time Series:
## Start = c(2022, 1) 
## End = c(2022, 6) 
## Frequency = 52 
## [1]  5654618  5894371  8308052  9164178 11934123  9777570
seasonplot(tseries_w, col = rainbow(3), year.labels = TRUE, main = "Seasonal Plot")
text(x = 1, y = max(tseries_w) - 1.5e7, labels = "2024", col = "blue")

# seasonplot monthly
df_sales_m_filtered <- df_merged_m %>%
  filter(month >= ymd("2021-12-31"))


tseries_m <- ts(df_sales_m_filtered$sales_m , start = c(2022, 1), frequency = 12)
head(tseries_m)
##           Jan      Feb      Mar      Apr      May      Jun
## 2022 31953282 41926179 37466677 57926246 79622843 88816435
seasonplot(tseries_m, col = rainbow(3), year.labels = TRUE, main = "Seasonal Plot")
text(x = 1, y = max(tseries_m) - 1e6, labels = "2024", col = "blue")

Density

# Monthly Density
# Select the columns of interest
variables <- c("sales_m", "bar_m", "food_m", "rain_m", "fx_m", "google_m",
               "ise", "inflation", "unemployment", "temp_m", "prcp_m")
# NOTE(review): assumes a google_m column exists in df_merged_m — confirm the
# Google Trends series was aggregated and merged upstream.


# Transform the data to long format for ggplot2
df_long_m <- df_merged_m %>%
  pivot_longer(cols = all_of(variables), names_to = "Variable", values_to = "Value")

# Create the grid of density plots, one facet per variable
ggplot(df_long_m, aes(x = Value)) +
  geom_density(fill = "blue", alpha = 0.4) +
  facet_wrap(~ Variable, scales = "free", ncol = 3) +
  labs(title = "Density Plots of Selected Variables",
       x = "Value", y = "Density") +
  theme_minimal()

# Weekly Density
# Select the columns of interest
variables <- c("sales_w", "bar_w", "food_w", "rain_w", "fx_w", "google_w",
                "temp_w", "prcp_w")



df_long_w <- df_merged_w %>%
  pivot_longer(cols = all_of(variables), names_to = "Variable", values_to = "Value")

# Create the grid of density plots
ggplot(df_long_w, aes(x = Value)) +
  geom_density(fill = "blue", alpha = 0.4) +
  facet_wrap(~ Variable, scales = "free", ncol = 3) +
  labs(title = "Density Plots of Selected Variables",
       x = "Value", y = "Density") +
  theme_minimal()

# Daily Density

# Select the columns of interest
variables <- c("sales_cop", "bar", "food", "rain_sum", "fx", 
               "tmedian", "prcp")



df_long_d <- df_merged_d %>%
  pivot_longer(cols = all_of(variables), names_to = "Variable", values_to = "Value")

# Create the grid of density plots
ggplot(df_long_d, aes(x = Value)) +
  geom_density(fill = "blue", alpha = 0.4) +
  facet_wrap(~ Variable, scales = "free", ncol = 3) +
  labs(title = "Density Plots of Selected Variables",
       x = "Value", y = "Density") +
  theme_minimal()

Covariates

### 3.5.1 economic variables-----------------------
# economic growth
ggplot(df_merged_m, aes(x=month, y=ise)) +
  geom_line() + ggtitle("Monthly activity in Colombia")

# clearly seasonal and trend

# fx
ggplot(df_merged_d, aes(x=date, y=fx)) +
  geom_line() + ggtitle("Daily COP/USD")

# trend but no clear seasonality

# inflation
ggplot(df_merged_m, aes(x=month, y=inflation)) +
  geom_line() + ggtitle("Monthly inflation National")

# business cycles, no trend or seasonality

# unemployment
ggplot(df_merged_m, aes(x=month, y=unemployment)) +
  geom_line() + ggtitle("Montly trailing unemployment Medellin")

# seasonal and trend downwards


### 3.5.2 Other variables

# google trends
ggplot(df_merged_w, aes(x=week, y=google_w)) +
  geom_line() + ggtitle("Weelkly Google trends 'Restaurantes'")

# no clear behaviour, drop in pandemic

# rain
ggplot(df_merged_d, aes(x=date, y=rain_sum)) +
  geom_line() + ggtitle("Daily rain approximated in Antioquia")

# no trend or seasonality clearly

# temperature (median)
ggplot(df_merged_d, aes(x=date, y=tmedian)) +
  geom_line() + ggtitle("Daily Median temperature in Medellin")

# almost stationary

# temperature (mean)
ggplot(df_merged_d, aes(x=date, y=tavg)) +
  geom_line() + ggtitle("Daily Average temperature in Medellin")

# this one looks weird, better keep working on median

# precipitation from temp
ggplot(df_merged_d, aes(x=date, y=prcp)) +
  geom_line() + ggtitle("Daily  precipitation in Medellin")

# looks decent

Pairplots

# region is constant ("ANTIOQUIA") after filtering, so drop it before ggpairs
df_merged_d <- subset(df_merged_d, select = -region)

# daily
ggpairs(df_merged_d, 
        columns = 2:8)

# sales have correl with fx and rain_sum
# weekly
ggpairs(df_merged_w, 
        columns = 2:9)

# sales have correl with rain, google, fx, temp
# bar has more correl with temp

# monthly
ggpairs(df_merged_m, 
        columns = 2:12)

Correlation Matrix

# Exclude 'date' column
# Daily correlation matrix over the numeric columns only.
numeric_df_d <- df_merged_d[, sapply(df_merged_d, is.numeric)]
cor_matrix_d <- cor(numeric_df_d, use = "complete.obs")  # Use only complete rows
cor_matrix_d
##             sales_cop          bar        food   rain_sum          fx
## sales_cop 1.000000000  0.895118281  0.98135791  0.1662524  0.14677410
## bar       0.895118281  1.000000000  0.79301917  0.1275670  0.13671488
## food      0.981357913  0.793019172  1.00000000  0.1716482  0.14261504
## rain_sum  0.166252448  0.127567019  0.17164822  1.0000000  0.63500919
## fx        0.146774104  0.136714885  0.14261504  0.6350092  1.00000000
## tavg      0.008884659  0.089640797 -0.02562481 -0.1340051  0.16455022
## prcp      0.028085504 -0.001337552  0.03913167  0.1857676  0.05556529
## tmedian   0.108891610  0.111686146  0.10120437 -0.2717930 -0.07710937
##                   tavg         prcp     tmedian
## sales_cop  0.008884659  0.028085504  0.10889161
## bar        0.089640797 -0.001337552  0.11168615
## food      -0.025624811  0.039131672  0.10120437
## rain_sum  -0.134005147  0.185767588 -0.27179302
## fx         0.164550220  0.055565287 -0.07710937
## tavg       1.000000000 -0.267367632  0.65617098
## prcp      -0.267367632  1.000000000 -0.22103872
## tmedian    0.656170981 -0.221038724  1.00000000
# Weekly correlation matrix over the numeric columns only.
numeric_df_w <- df_merged_w[, sapply(df_merged_w, is.numeric)]
cor_matrix_w <- cor(numeric_df_w, use = "complete.obs")  # Use only complete rows
cor_matrix_w
##              sales_w      bar_w      food_w     rain_w     google_w
## sales_w   1.00000000  0.8910033  0.98922211  0.2913161 -0.444675519
## bar_w     0.89100331  1.0000000  0.81506933  0.2875652 -0.355177208
## food_w    0.98922211  0.8150693  1.00000000  0.2785842 -0.453332380
## rain_w    0.29131607  0.2875652  0.27858415  1.0000000 -0.107348280
## google_w -0.44467552 -0.3551772 -0.45333238 -0.1073483  1.000000000
## fx_w      0.24395629  0.2873958  0.22038752  0.6626647 -0.002777267
## temp_w   -0.02746535  0.1275798 -0.07507291 -0.1062959  0.208497604
## prcp_w    0.17241188  0.1211778  0.18095801  0.3289820 -0.251630474
##                  fx_w      temp_w     prcp_w
## sales_w   0.243956293 -0.02746535  0.1724119
## bar_w     0.287395824  0.12757976  0.1211778
## food_w    0.220387520 -0.07507291  0.1809580
## rain_w    0.662664730 -0.10629590  0.3289820
## google_w -0.002777267  0.20849760 -0.2516305
## fx_w      1.000000000  0.19315256  0.1078585
## temp_w    0.193152564  1.00000000 -0.3047876
## prcp_w    0.107858510 -0.30478758  1.0000000
# Monthly correlation matrix over the numeric columns only.
numeric_df_m <- df_merged_m[, sapply(df_merged_m, is.numeric)]
cor_matrix_m <- cor(numeric_df_m, use = "complete.obs")  # Use only complete rows
cor_matrix_m
##                  sales_m       bar_m     food_m       rain_m         fx_m
## sales_m       1.00000000  0.91772972  0.9930535  0.345901419  0.260081195
## bar_m         0.91772972  1.00000000  0.8646613  0.357818748  0.330438807
## food_m        0.99305351  0.86466133  1.0000000  0.331685570  0.232546131
## rain_m        0.34590142  0.35781875  0.3316856  1.000000000  0.700900212
## fx_m          0.26008119  0.33043881  0.2325461  0.700900212  1.000000000
## google_m     -0.60141163 -0.43583265 -0.6306910 -0.127328581 -0.016302700
## ise           0.14184921  0.09197376  0.1528346  0.143925313  0.006203153
## inflation     0.24620155  0.41865460  0.1885633  0.554477494  0.731585721
## unemployment -0.47774546 -0.35355440 -0.4999277 -0.314673431 -0.148857922
## temp_m       -0.05360312  0.12366320 -0.1041492 -0.004770513  0.242855832
## prcp_m        0.28057040  0.20114717  0.2956418  0.364914544  0.106392263
##                 google_m          ise  inflation unemployment       temp_m
## sales_m      -0.60141163  0.141849210  0.2462015 -0.477745462 -0.053603118
## bar_m        -0.43583265  0.091973761  0.4186546 -0.353554400  0.123663200
## food_m       -0.63069104  0.152834568  0.1885633 -0.499927664 -0.104149167
## rain_m       -0.12732858  0.143925313  0.5544775 -0.314673431 -0.004770513
## fx_m         -0.01630270  0.006203153  0.7315857 -0.148857922  0.242855832
## google_m      1.00000000  0.093304022  0.1600715  0.425504221  0.339768042
## ise           0.09330402  1.000000000 -0.1118127 -0.602525241 -0.190530186
## inflation     0.16007149 -0.111812691  1.0000000 -0.130801728  0.598250278
## unemployment  0.42550422 -0.602525241 -0.1308017  1.000000000 -0.007286939
## temp_m        0.33976804 -0.190530186  0.5982503 -0.007286939  1.000000000
## prcp_m       -0.37823361  0.047259121  0.0657641 -0.415992203 -0.280144464
##                   prcp_m
## sales_m       0.28057040
## bar_m         0.20114717
## food_m        0.29564181
## rain_m        0.36491454
## fx_m          0.10639226
## google_m     -0.37823361
## ise           0.04725912
## inflation     0.06576410
## unemployment -0.41599220
## temp_m       -0.28014446
## prcp_m        1.00000000
# Plot the Correlation Matrix at each frequency
par(mfrow=c(1,1))
corrplot(cor_matrix_d, method = "color", type = "upper", tl.col = "black", tl.srt = 45)  # daily

corrplot(cor_matrix_w, method = "color", type = "upper", tl.col = "black", tl.srt = 45)  # weekly

corrplot(cor_matrix_m, method = "color", type = "upper", tl.col = "black", tl.srt = 45)  # monthly

Rain has a stronger correlation than prcp, so we drop prcp to avoid repeating the same variable from two sources. We also drop average temperature because median temperature seems more trustworthy.

# drop prcp because rain and prcp proxy the same phenomenon from two sources
df_merged_m <- df_merged_m %>% select(-prcp_m)
df_merged_w <- df_merged_w %>% select(-prcp_w)
df_merged_d <- df_merged_d %>% select(-prcp)

# drop avg temp (median temperature judged more trustworthy above)
df_merged_d <- df_merged_d %>% select(-tavg)
colnames(df_merged_d)
## [1] "date"      "sales_cop" "bar"       "food"      "rain_sum"  "fx"       
## [7] "tmedian"
### drop everything not in use
objects_to_keep <- c("df_merged_d", "df_merged_w", "df_merged_m")
# Remove all objects except those specified
rm(list = setdiff(ls(), objects_to_keep))

Variable Transformation

POSIXct and POSIXlt Classes

Times and date-times are represented by the POSIXct or the POSIXlt class in R. The POSIXct format stores date and time in seconds with the number of seconds beginning at January 1, 1970, so a POSIXct date-time is essentially a single value on a timeline. Date-times prior to 1970 will be negative numbers. The POSIXlt class stores other date and time information in a list, such as the hour, the day of the week, the month of the year, etc. The starting year for POSIXlt data is 1900, so 2022 would be stored as year 122. Months also begin at 0, so January is stored as month 0 and February as month 1. For both POSIX classes, the timezone can be specified. While date-times stored as POSIXct and POSIXlt look similar, when you unclass them with the unclass() function, you can see the additional information stored within the POSIXlt data.

Date Class

Dates without time can simply be stored as a Date class in R using the as.Date() function. Both Date and POSIX classes need to be defined based on how they are formatted. When uploading time series data into R, date and date-time data is typically uploaded as a character class and must be converted to a date or time class using the as.Date(), as.POSIXct() or as.POSIXlt() functions.

Monthly

# Vars for model
# Month
# Ensure the `month` column is in POSIXct format
df_merged_m$month <- as.POSIXct(df_merged_m$month)

# Create the numeric variable: an evenly increasing number (linear trend term)
df_merged_m <- df_merged_m %>%
  arrange(month) %>%  # Ensure data is sorted by month
  mutate(numeric_month = row_number())  # Assign an increasing number

# Create the seasonal variable: the 12 different months as a factor
# NOTE(review): format(month, "%B") is locale-dependent; levels = month.name
# assumes English month names — confirm the session locale.
df_merged_m <- df_merged_m %>%
  mutate(seasonal_month = factor(format(month, "%B"), levels = month.name))  # Month names as ordered factors

Weekly

# Week
# Ensure the `week` column is in POSIXct format
df_merged_w$week <- as.POSIXct(df_merged_w$week)

# Create the numeric variable: an evenly increasing number (linear trend term)
df_merged_w <- df_merged_w %>%
  arrange(week) %>%  # Ensure data is sorted by week
  mutate(numeric_week = row_number())  # Assign an increasing number

# Create the seasonal variable: month-of-year (taken from the week start) as a factor
# NOTE(review): locale-dependent; levels = month.name assumes English names.
df_merged_w <- df_merged_w %>%
  mutate(seasonal_month = factor(format(week, "%B"), levels = month.name))  # Month names as ordered factors

Daily

# Day
# Ensure the `date` column is in POSIXct format
df_merged_d$date <- as.POSIXct(df_merged_d$date)

# Create the numeric variable: an evenly increasing number (linear trend term)
df_merged_d <- df_merged_d %>%
  arrange(date) %>%  # Ensure data is sorted by day
  mutate(numeric_day = row_number())  # Assign an increasing number
# Create the seasonal variable: month-of-year as a factor
df_merged_d <- df_merged_d %>%
  mutate(seasonal_month = factor(format(date, "%B"), levels = month.name))  # Month names as ordered factors

# Create a column indicating the day of the week
# NOTE(review): weekdays() is locale-dependent; the levels assume English names.
df_merged_d <- df_merged_d %>%
  mutate(day_of_week = factor(weekdays(date), levels = c("Monday", "Tuesday", "Wednesday", 
                                                         "Thursday", "Friday", "Saturday", "Sunday")))  # Day of the week as ordered factor

Time Series Objects

Convert sales to time series objects for the use in several models

# convert to time series (no frequency set: plain index-based series)
sales_d_ts <- ts(df_merged_d$sales_cop)
sales_w_ts <- ts(df_merged_w$sales_w)
sales_m_ts <- ts(df_merged_m$sales_m)

par(mfrow=c(1,1))

# Daily: series + ACF + PACF in one display
tsdisplay(sales_d_ts)

# is not stationary but has no clear trend
# and seasonality every 7 days

# Weekly
tsdisplay(sales_w_ts)

# not stationary: has trend

# Monthly
tsdisplay(sales_m_ts)

# has clear trend, no seasonality

Log Transformation

Some variables are scaled to log, so we can interpret the linear models more easily. The covariates are in different scales so it is easier to interpret percentage changes instead of unit changes.

# Monthly
# unemployment and inflation are already rates, so they stay in levels
# NOTE(review): across(where(is.numeric)) also log-transforms the
# numeric_month/numeric_week/numeric_day trend indices — confirm intended.
df_merged_m <- df_merged_m %>%
  mutate(across(where(is.numeric) & !all_of(c("unemployment", "inflation")), ~ log(. + 1)))

# Weekly
df_merged_w <- df_merged_w %>%
  mutate(across(where(is.numeric), ~ log(. + 1)))

# Daily
df_merged_d <- df_merged_d %>%
  mutate(across(where(is.numeric), ~ log(. + 1)))

Autocorrelation

#par(mfrow=c(1,1))
#tsdisplay(sales_d_ts)
# is not stationary but has no clear trend

# NOTE(review): the ts objects were built before the log transform above,
# so these plots show the raw (not log) series — confirm that is intended.
plot(sales_d_ts)

acf(sales_d_ts)

pacf(sales_d_ts)

When data are seasonal, the autocorrelation will be larger for the seasonal lags (at multiples of the seasonal period) than for other lags.

# Weekly

#tsdisplay(sales_w_ts)
plot(sales_w_ts)

acf(sales_w_ts)

pacf(sales_w_ts)

# not stationary: has trend and seasonality maybe
# Monthly

#tsdisplay(sales_m_ts)
plot(sales_m_ts)

acf(sales_m_ts)

pacf(sales_m_ts)

# has clear trend, no seasonality

Models

In this section we model the time series using various approaches to find the best model for our data. We use both linear and non linear models going from the simplest to the more “complex” models.

Helper functions

Functions that help us implement and analyze models faster

## Function to create and summarize models------------------
## Fit an OLS model with lm(), print its summary, draw the four standard
## lm diagnostic plots, and return the fitted model object.
## - formula: model formula passed to lm()
## - data: data frame the formula refers to
## - model_name: label printed before the summary
run_model <- function(formula, data, model_name) {
  cat("\nRunning", model_name, "\n")
  fitted_model <- lm(formula, data = data)
  print(summary(fitted_model))
  # 2x2 panel for the residual / QQ / scale-location / leverage plots
  par(mfrow = c(2, 2))
  plot(fitted_model)
  return(fitted_model)
}

# Function to compare models using ANOVA
# Compare two (nested) fitted models with an F-test via anova(),
# print the table, and return it.
compare_models <- function(model1, model2, name1, name2) {
  cat("\nComparing Models:", name1, "vs", name2, "\n")
  comparison <- anova(model1, model2)
  print(comparison)
  comparison
}

# Function to add predictions to the dataset
# Return `data` with an extra column `pred_column` holding the
# predictions of `model` evaluated on `data` itself (in-sample fit).
add_predictions <- function(model, data, pred_column) {
  fitted_values <- predict(model, newdata = data)
  data[[pred_column]] <- fitted_values
  data
}

# Calculate RMSE
# Function to calculate RMSE
# Root mean squared error between observed and predicted values.
# Pairs containing NA are dropped from the mean (na.rm = TRUE).
calculate_rmse <- function(observed, predicted) {
  squared_errors <- (observed - predicted)^2
  sqrt(mean(squared_errors, na.rm = TRUE))
}


# function that compares linear models
# Define the function to get R^2 and AIC
# Summarize a (named) list of fitted lm models into one data frame with
# columns Model, R2 (multiple R-squared) and AIC, in list order.
# Fixes: builds all rows first and binds once (no O(n^2) rbind growth),
# handles an empty list, and falls back to positional labels when the
# list is unnamed (names(models)[i] would otherwise break the result).
get_model_stats <- function(models) {
  # Preserve the original return shape for an empty input.
  if (length(models) == 0) {
    return(data.frame(
      Model = character(),
      R2 = numeric(),
      AIC = numeric(),
      stringsAsFactors = FALSE
    ))
  }
  model_names <- names(models)
  if (is.null(model_names)) {
    model_names <- paste0("Model ", seq_along(models))
  }
  # One single-row data frame per model, bound together at the end.
  rows <- lapply(seq_along(models), function(i) {
    model <- models[[i]]
    data.frame(
      Model = model_names[i],
      R2 = summary(model)$r.squared,
      AIC = AIC(model),
      stringsAsFactors = FALSE
    )
  })
  do.call(rbind, rows)
}

Linear models

Monthly

# Monthly Models
# View Dataframe
head(df_merged_m)
# Model 0: Trend only (log monthly sales regressed on a linear time index)
ols0 <- run_model(sales_m ~ numeric_month, df_merged_m, "Model 0")
## 
## Running Model 0 
## 
## Call:
## lm(formula = formula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9910 -0.1192  0.0158  0.1476  0.4953 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   16.40311    0.18031   90.97  < 2e-16 ***
## numeric_month  0.68309    0.06312   10.82 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.28 on 34 degrees of freedom
## Multiple R-squared:  0.775,  Adjusted R-squared:  0.7684 
## F-statistic: 117.1 on 1 and 34 DF,  p-value: 1.485e-12

df_merged_m <- add_predictions(ols0, df_merged_m, "predicted_sales0")

# Model 1: Trend + Seasonality
ols1 <- run_model(sales_m ~ numeric_month + seasonal_month, df_merged_m, "Model 1")
## 
## Running Model 1 
## 
## Call:
## lm(formula = formula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7403 -0.1590 -0.0135  0.2071  0.4347 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             16.505470   0.260998  63.240  < 2e-16 ***
## numeric_month            0.657412   0.075865   8.666 1.06e-08 ***
## seasonal_monthFebruary  -0.001699   0.254032  -0.007    0.995    
## seasonal_monthMarch     -0.006230   0.254345  -0.024    0.981    
## seasonal_monthApril     -0.063031   0.254777  -0.247    0.807    
## seasonal_monthMay        0.070611   0.255288   0.277    0.785    
## seasonal_monthJune       0.037438   0.255855   0.146    0.885    
## seasonal_monthJuly       0.138742   0.256462   0.541    0.594    
## seasonal_monthAugust    -0.010812   0.257098  -0.042    0.967    
## seasonal_monthSeptember -0.133330   0.257755  -0.517    0.610    
## seasonal_monthOctober   -0.058397   0.258427  -0.226    0.823    
## seasonal_monthNovember  -0.335279   0.254924  -1.315    0.201    
## seasonal_monthDecember  -0.016084   0.254094  -0.063    0.950    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.311 on 23 degrees of freedom
## Multiple R-squared:  0.8123, Adjusted R-squared:  0.7144 
## F-statistic: 8.296 on 12 and 23 DF,  p-value: 8.937e-06

df_merged_m <- add_predictions(ols1, df_merged_m, "predicted_sales1")


## Model 2: Backward Stepwise Regression 

# Start with the full model (excluding food and bar -- presumably
# components of total sales, so including them would leak the
# response into the predictors; TODO confirm)
ols2_full <- lm(
  sales_m ~ numeric_month + seasonal_month + unemployment + ise + fx_m +
    google_m + temp_m + rain_m, 
  data = df_merged_m
)


# Perform backward stepwise regression (AIC-based variable elimination)
ols2_stepwise <- step(
  ols2_full, 
  direction = "backward",
  trace = 1 # Prints the stepwise regression process
)
## Start:  AIC=-104.31
## sales_m ~ numeric_month + seasonal_month + unemployment + ise + 
##     fx_m + google_m + temp_m + rain_m
## 
##                  Df Sum of Sq     RSS      AIC
## - temp_m          1   0.00008 0.69105 -106.310
## - unemployment    1   0.01523 0.70620 -105.530
## <none>                        0.69097 -104.314
## - rain_m          1   0.07578 0.76674 -102.568
## - seasonal_month 11   0.71692 1.40789 -100.691
## - ise             1   0.17179 0.86276  -98.321
## - fx_m            1   0.39083 1.08180  -90.176
## - google_m        1   0.57575 1.26672  -84.495
## - numeric_month   1   2.29957 2.99054  -53.570
## 
## Step:  AIC=-106.31
## sales_m ~ numeric_month + seasonal_month + unemployment + ise + 
##     fx_m + google_m + rain_m
## 
##                  Df Sum of Sq     RSS      AIC
## - unemployment    1   0.02314 0.71419 -107.125
## <none>                        0.69105 -106.310
## - rain_m          1   0.07693 0.76798 -104.510
## - seasonal_month 11   0.79109 1.48214 -100.841
## - ise             1   0.17775 0.86880 -100.070
## - fx_m            1   0.43032 1.12137  -90.883
## - google_m        1   0.65543 1.34648  -84.297
## - numeric_month   1   2.42508 3.11613  -54.089
## 
## Step:  AIC=-107.12
## sales_m ~ numeric_month + seasonal_month + ise + fx_m + google_m + 
##     rain_m
## 
##                  Df Sum of Sq    RSS      AIC
## <none>                        0.7142 -107.125
## - rain_m          1    0.0940 0.8082 -104.673
## - seasonal_month 11    0.7847 1.4989 -102.437
## - ise             1    0.1606 0.8748 -101.823
## - fx_m            1    0.4343 1.1485  -92.022
## - google_m        1    0.6466 1.3608  -85.916
## - numeric_month   1    4.1118 4.8260  -40.342
# Summary of the final stepwise model
summary(ols2_stepwise)
## 
## Call:
## lm(formula = sales_m ~ numeric_month + seasonal_month + ise + 
##     fx_m + google_m + rain_m, data = df_merged_m)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31229 -0.08760 -0.01030  0.09029  0.31789 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -52.31160   19.99397  -2.616 0.016981 *  
## numeric_month             0.93626    0.08952  10.459 2.54e-09 ***
## seasonal_monthFebruary    0.27615    0.19812   1.394 0.179459    
## seasonal_monthMarch      -0.33136    0.28825  -1.150 0.264583    
## seasonal_monthApril      -0.03032    0.20910  -0.145 0.886232    
## seasonal_monthMay        -0.12629    0.27633  -0.457 0.652850    
## seasonal_monthJune       -0.16788    0.27401  -0.613 0.547361    
## seasonal_monthJuly       -0.11655    0.30708  -0.380 0.708483    
## seasonal_monthAugust     -0.45680    0.33544  -1.362 0.189189    
## seasonal_monthSeptember  -0.40348    0.27930  -1.445 0.164861    
## seasonal_monthOctober    -0.21625    0.28064  -0.771 0.450439    
## seasonal_monthNovember   -0.67606    0.38426  -1.759 0.094603 .  
## seasonal_monthDecember   -1.52898    0.67026  -2.281 0.034247 *  
## ise                       7.20920    3.48786   2.067 0.052641 .  
## fx_m                      2.72323    0.80115   3.399 0.003010 ** 
## google_m                  3.17853    0.76636   4.148 0.000547 ***
## rain_m                   -0.20297    0.12835  -1.581 0.130285    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1939 on 19 degrees of freedom
## Multiple R-squared:  0.9397, Adjusted R-squared:  0.889 
## F-statistic: 18.52 on 16 and 19 DF,  p-value: 2.609e-08
plot(ols2_stepwise)

# Add predictions from the final stepwise model
df_merged_m <- add_predictions(ols2_stepwise, df_merged_m, "predicted_sales2")


# Plot actual vs predicted monthly sales. Models were fit on log sales,
# so predictions are back-transformed with exp() for the plot.

plot_colors <- c("black", "red", "blue", "darkgreen")
line_types <- c(1, 2, 3, 4)  # Solid, dashed, dotted, dot-dash

par(mfrow = c(1, 1))
# Base plot: actual monthly sales as points
plot(
  df_merged_m$month, exp(df_merged_m$sales_m),
  type = "p",
  col = "black", pch = 16,
  xlab = "Month",
  ylab = "Sales",
  main = "Actual vs Predicted Monthly Sales for All Models"
)

# Overlay one line per model, reading predicted_sales0..predicted_sales2
for (k in 0:2) {
  lines(
    df_merged_m$month, exp(df_merged_m[[paste0("predicted_sales", k)]]),
    col = plot_colors[k + 2],
    lty = line_types[k + 2],
    lwd = 2
  )
}

# Legend: points for actuals, line styles for the three models
legend(
  "bottomright",
  legend = c("Actual Sales", "Model 0", "Model 1", "Model 2 Stepwise"),
  col = c("black", plot_colors[2:4]),
  lty = c(NA, line_types[2:4]),
  pch = c(16, NA, NA, NA),
  bty = "n"
)

# Models to compare (fitted above)
models <- list(
  "Model trend" = ols0,
  "Model trend + season" = ols1,
  "Model all covariates step" = ols2_stepwise
)

# Get R^2 and AIC for each model
model_stats <- get_model_stats(models)


#  RMSE calculation for the original (exponentiated) scale
rmse_stats <- data.frame(
  Model = character(),
  RMSE = numeric(),
  stringsAsFactors = FALSE
)

# Loop through each model; predicted_sales0..predicted_sales2 were added
# in the same order as the entries of `models`, hence the i - 1 index.
for (i in seq_along(models)) {
  model_name <- names(models)[i]
  predicted_column <- paste0("predicted_sales", i - 1)  # Adjust column name index
  
  # Calculate RMSE on the original scale
  rmse <- calculate_rmse(
    observed = exp(df_merged_m$sales_m),          # Exponentiate actual values
    predicted = exp(df_merged_m[[predicted_column]])  # Exponentiate predicted values
  )
  
  # Append results to the RMSE stats table
  rmse_stats <- rbind(rmse_stats, data.frame(Model = model_name, RMSE = rmse))
}

# Combine R^2, AIC, and RMSE into one table
# (note: merge() sorts the rows alphabetically by Model)
combined_stats <- merge(model_stats, rmse_stats, by = "Model")

# View the combined table
print(combined_stats)
##                       Model        R2       AIC     RMSE
## 1 Model all covariates step 0.9397340 -2.961086 13229295
## 2               Model trend 0.7750083 14.461543 21374338
## 3      Model trend + season 0.8123201 29.933834 21523336
# rmse_stats keeps the loop order (trend, trend+season, stepwise),
# so row 3 is the stepwise model -- the best of the three.
rmse_ols_m <- rmse_stats$RMSE[3]
rmse_ols_m
## [1] 13229295
# check the residuals of the best model
checkresiduals(ols2_stepwise$residuals)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 9.5442, df = 7, p-value = 0.2159
## 
## Model df: 0.   Total lags used: 7

Weekly

# Weekly Models
head(df_merged_w)
## Clean Data - Drop rows 1-2 because sales are 0 (the restaurant was not open yet)
df_merged_w <- df_merged_w %>% slice(-1, -2)

## Model 0A: Trend only
ols0w <- run_model(sales_w ~ numeric_week, df_merged_w, "Model 0A")
## 
## Running Model 0A 
## 
## Call:
## lm(formula = formula, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72559 -0.15917  0.01874  0.16674  0.65873 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.63582    0.10036  145.84   <2e-16 ***
## numeric_week  0.53273    0.02372   22.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2406 on 153 degrees of freedom
## Multiple R-squared:  0.7673, Adjusted R-squared:  0.7658 
## F-statistic: 504.5 on 1 and 153 DF,  p-value: < 2.2e-16

df_merged_w <- add_predictions(ols0w, df_merged_w, "predicted_sales0")

## Model 1A: Trend + Seasonality
ols1w <- run_model(sales_w ~ numeric_week + seasonal_month, df_merged_w, "Model 1A")
## 
## Running Model 1A 
## 
## Call:
## lm(formula = formula, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62472 -0.14172  0.01215  0.16457  0.57707 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             14.67383    0.11499 127.613   <2e-16 ***
## numeric_week             0.53053    0.02537  20.915   <2e-16 ***
## seasonal_monthFebruary   0.01863    0.09432   0.197    0.844    
## seasonal_monthMarch     -0.07386    0.09266  -0.797    0.427    
## seasonal_monthApril     -0.04300    0.09278  -0.463    0.644    
## seasonal_monthMay        0.05120    0.09279   0.552    0.582    
## seasonal_monthJune       0.03179    0.09343   0.340    0.734    
## seasonal_monthJuly       0.09360    0.09164   1.021    0.309    
## seasonal_monthAugust    -0.03924    0.09572  -0.410    0.683    
## seasonal_monthSeptember -0.12111    0.09425  -1.285    0.201    
## seasonal_monthOctober   -0.08215    0.09245  -0.889    0.376    
## seasonal_monthNovember  -0.05447    0.09654  -0.564    0.574    
## seasonal_monthDecember  -0.13661    0.09221  -1.481    0.141    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2394 on 142 degrees of freedom
## Multiple R-squared:  0.7862, Adjusted R-squared:  0.7681 
## F-statistic:  43.5 on 12 and 142 DF,  p-value: < 2.2e-16

df_merged_w <- add_predictions(ols1w, df_merged_w, "predicted_sales1")


## Model 2A: Experimentation


# Start with the full model (excluding food and bar)
ols2_full_w <- lm(
  sales_w ~ numeric_week + seasonal_month + fx_w +
    google_w + temp_w + rain_w, 
  data = df_merged_w
)


# Perform backward stepwise regression
ols2_stepwise_w <- step(
  ols2_full_w, 
  direction = "backward",
  trace = 1 # Prints the stepwise regression process
)
## Start:  AIC=-470.72
## sales_w ~ numeric_week + seasonal_month + fx_w + google_w + temp_w + 
##     rain_w
## 
##                  Df Sum of Sq     RSS     AIC
## - google_w        1    0.0047  5.9773 -472.59
## - temp_w          1    0.0524  6.0250 -471.36
## <none>                         5.9726 -470.72
## - seasonal_month 11    0.9898  6.9624 -468.95
## - rain_w          1    0.1643  6.1370 -468.51
## - fx_w            1    1.1370  7.1096 -445.71
## - numeric_week    1   16.1784 22.1510 -269.56
## 
## Step:  AIC=-472.59
## sales_w ~ numeric_week + seasonal_month + fx_w + temp_w + rain_w
## 
##                  Df Sum of Sq     RSS     AIC
## - temp_w          1    0.0597  6.0370 -473.05
## <none>                         5.9773 -472.59
## - seasonal_month 11    0.9855  6.9628 -470.94
## - rain_w          1    0.1652  6.1426 -470.37
## - fx_w            1    1.1490  7.1264 -447.34
## - numeric_week    1   19.7232 25.7005 -248.52
## 
## Step:  AIC=-473.05
## sales_w ~ numeric_week + seasonal_month + fx_w + rain_w
## 
##                  Df Sum of Sq     RSS     AIC
## <none>                         6.0370 -473.05
## - rain_w          1    0.2277  6.2648 -469.31
## - seasonal_month 11    1.3008  7.3379 -464.81
## - fx_w            1    1.6010  7.6380 -438.59
## - numeric_week    1   20.3284 26.3655 -246.56
# Summary of the final stepwise model
summary(ols2_stepwise_w)
## 
## Call:
## lm(formula = sales_w ~ numeric_week + seasonal_month + fx_w + 
##     rain_w, data = df_merged_w)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5797 -0.1243  0.0106  0.1247  0.5622 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1.941702   2.597439  -0.748   0.4560    
## numeric_week             0.542540   0.024988  21.712  < 2e-16 ***
## seasonal_monthFebruary   0.003119   0.082343   0.038   0.9698    
## seasonal_monthMarch     -0.008609   0.081117  -0.106   0.9156    
## seasonal_monthApril      0.058496   0.084093   0.696   0.4878    
## seasonal_monthMay        0.160561   0.085157   1.885   0.0614 .  
## seasonal_monthJune       0.171277   0.088309   1.940   0.0544 .  
## seasonal_monthJuly       0.140843   0.079982   1.761   0.0804 .  
## seasonal_monthAugust    -0.015579   0.083209  -0.187   0.8518    
## seasonal_monthSeptember -0.130180   0.082537  -1.577   0.1170    
## seasonal_monthOctober   -0.116443   0.081307  -1.432   0.1543    
## seasonal_monthNovember  -0.011869   0.090153  -0.132   0.8954    
## seasonal_monthDecember  -0.096493   0.081089  -1.190   0.2361    
## fx_w                     2.106952   0.345784   6.093 1.01e-08 ***
## rain_w                  -0.121385   0.052818  -2.298   0.0230 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2077 on 140 degrees of freedom
## Multiple R-squared:  0.8414, Adjusted R-squared:  0.8255 
## F-statistic: 53.04 on 14 and 140 DF,  p-value: < 2.2e-16
# Add predictions from the final stepwise model
df_merged_w <- add_predictions(ols2_stepwise_w, df_merged_w, "predicted_sales2")

# Plot actual vs predicted weekly sales (predictions back-transformed
# with exp() since the models were fit on log sales).

par(mfrow = c(1, 1))
# Base plot: actual weekly sales as points
plot(
  df_merged_w$week, exp(df_merged_w$sales_w),
  type = "p",
  col = "black", pch = 16,
  xlab = "Week",
  ylab = "Sales",
  main = "Actual vs Predicted Weekly Sales for All Models"
)

# Overlay one line per model, reading predicted_sales0..predicted_sales2
for (k in 0:2) {
  lines(
    df_merged_w$week, exp(df_merged_w[[paste0("predicted_sales", k)]]),
    col = plot_colors[k + 2],
    lty = line_types[k + 2],
    lwd = 2
  )
}

# Legend: points for actuals, line styles for the three models
legend(
  "bottomright",
  legend = c("Actual Sales", "Model 0", "Model 1", "Model 2 Stepwise"),
  col = c("black", plot_colors[2:4]),
  lty = c(NA, line_types[2:4]),
  pch = c(16, NA, NA, NA),
  bty = "n"
)

# Models to compare (the list was accidentally defined twice with
# identical contents; the duplicate has been removed)
models_w <- list(
  "Model trend" = ols0w,
  "Model trend + season" = ols1w,
  "Model all covariates step" = ols2_stepwise_w
)

# Get R^2 and AIC for each model
model_stats_w <- get_model_stats(models_w)

# View the results
print(model_stats_w)
##                       Model        R2        AIC
## 1               Model trend 0.7672955   2.217403
## 2      Model trend + season 0.7861620  11.112056
## 3 Model all covariates step 0.8413760 -31.183618
# RMSE on the original (exponentiated) scale for the weekly models
rmse_stats_w <- data.frame(
  Model = character(),
  RMSE = numeric(),
  stringsAsFactors = FALSE
)

# Loop through each model; predicted_sales0..predicted_sales2 were added
# in the same order as the entries of `models_w`, hence the i - 1 index.
for (i in seq_along(models_w)) {
  model_name <- names(models_w)[i]
  predicted_column <- paste0("predicted_sales", i - 1)  # Adjust column name index
  
  # Calculate RMSE on the original scale
  rmse <- calculate_rmse(
    observed = exp(df_merged_w$sales_w),          # Exponentiate actual values
    predicted = exp(df_merged_w[[predicted_column]])  # Exponentiate predicted values
  )
  
  # Append results to the RMSE stats table
  rmse_stats_w <- rbind(rmse_stats_w, data.frame(Model = model_name, RMSE = rmse))
}

# Combine R^2, AIC, and RMSE into one table
# (note: merge() sorts the rows alphabetically by Model)
combined_stats_w <- merge(model_stats_w, rmse_stats_w, by = "Model")

# View the combined table
print(combined_stats_w)
##                       Model        R2        AIC    RMSE
## 1 Model all covariates step 0.8413760 -31.183618 4000370
## 2               Model trend 0.7672955   2.217403 4745735
## 3      Model trend + season 0.7861620  11.112056 4668952
# rmse_stats_w keeps the loop order, so row 3 is the stepwise model
rmse_ols_w <- rmse_stats_w$RMSE[3]
rmse_ols_w
## [1] 4000370
checkresiduals(ols2_stepwise_w$residuals)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 80.439, df = 10, p-value = 4.118e-13
## 
## Model df: 0.   Total lags used: 10

Residuals are autocorrelated and have structure

Daily

# Daily Models
head(df_merged_d,25)
# Keep only observations from December 2021 onward (properly opened then)
df_merged_d <-  df_merged_d %>%
  filter(date > "2021-11-30")
head(df_merged_d)
## Model 0: Trend only
ols0d <- run_model(sales_cop ~ numeric_day, df_merged_d, "Model 0A")
## 
## Running Model 0A 
## 
## Call:
## lm(formula = formula, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2083  -0.1814   0.2027   0.5771   1.6749 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.5199     0.4261  24.688   <2e-16 ***
## numeric_day   0.6774     0.0695   9.746   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.762 on 1038 degrees of freedom
## Multiple R-squared:  0.08383,    Adjusted R-squared:  0.08295 
## F-statistic: 94.98 on 1 and 1038 DF,  p-value: < 2.2e-16

df_merged_d <- add_predictions(ols0d, df_merged_d, "predicted_sales0")

## Model 1: Trend + Seasonality
ols1d <- run_model(sales_cop ~ numeric_day + seasonal_month + day_of_week, df_merged_d, "Model 1A")
## 
## Running Model 1A 
## 
## Call:
## lm(formula = formula, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1930  -0.1446   0.1520   0.4321   2.1992 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             10.41648    0.46748  22.282  < 2e-16 ***
## numeric_day              0.60222    0.07264   8.290 3.54e-16 ***
## seasonal_monthFebruary   0.37785    0.26287   1.437  0.15091    
## seasonal_monthMarch      0.24291    0.25677   0.946  0.34435    
## seasonal_monthApril      0.21725    0.26027   0.835  0.40407    
## seasonal_monthMay        0.03779    0.25763   0.147  0.88342    
## seasonal_monthJune       0.15499    0.26174   0.592  0.55388    
## seasonal_monthJuly       0.23454    0.25933   0.904  0.36599    
## seasonal_monthAugust     0.25260    0.26021   0.971  0.33190    
## seasonal_monthSeptember -0.03710    0.26437  -0.140  0.88843    
## seasonal_monthOctober    0.23107    0.26387   0.876  0.38140    
## seasonal_monthNovember   0.08953    0.29411   0.304  0.76087    
## seasonal_monthDecember  -0.71856    0.25824  -2.783  0.00549 ** 
## day_of_weekTuesday      -0.06744    0.19830  -0.340  0.73385    
## day_of_weekWednesday     0.22634    0.19767   1.145  0.25247    
## day_of_weekThursday      0.47521    0.19734   2.408  0.01621 *  
## day_of_weekFriday        1.10918    0.19862   5.584 3.01e-08 ***
## day_of_weekSaturday      0.93769    0.19827   4.729 2.57e-06 ***
## day_of_weekSunday        0.61578    0.19826   3.106  0.00195 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.705 on 1021 degrees of freedom
## Multiple R-squared:  0.1557, Adjusted R-squared:  0.1408 
## F-statistic: 10.46 on 18 and 1021 DF,  p-value: < 2.2e-16

df_merged_d <- add_predictions(ols1d, df_merged_d, "predicted_sales1")

# Model 2: Backward stepwise selection on the full daily model
head(df_merged_d)
# Start with the full model (excluding food and bar)
ols2_full_d <- lm(
  sales_cop ~ numeric_day + seasonal_month + day_of_week + fx +
     tmedian + rain_sum, 
  data = df_merged_d
)
summary(ols2_full_d)
## 
## Call:
## lm(formula = sales_cop ~ numeric_day + seasonal_month + day_of_week + 
##     fx + tmedian + rain_sum, data = df_merged_d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2560  -0.1301   0.1434   0.4180   2.2430 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.681458   7.563025   0.487  0.62653    
## numeric_day              0.555771   0.078969   7.038 3.58e-12 ***
## seasonal_monthFebruary   0.409437   0.263765   1.552  0.12091    
## seasonal_monthMarch      0.254867   0.257099   0.991  0.32176    
## seasonal_monthApril      0.215634   0.264563   0.815  0.41523    
## seasonal_monthMay        0.002179   0.263006   0.008  0.99339    
## seasonal_monthJune       0.125315   0.268566   0.467  0.64088    
## seasonal_monthJuly       0.230188   0.260048   0.885  0.37627    
## seasonal_monthAugust     0.284052   0.260701   1.090  0.27616    
## seasonal_monthSeptember  0.004766   0.265632   0.018  0.98569    
## seasonal_monthOctober    0.152011   0.266104   0.571  0.56796    
## seasonal_monthNovember  -0.092994   0.303301  -0.307  0.75921    
## seasonal_monthDecember  -0.744359   0.258962  -2.874  0.00413 ** 
## day_of_weekTuesday      -0.063129   0.198020  -0.319  0.74994    
## day_of_weekWednesday     0.237591   0.197534   1.203  0.22934    
## day_of_weekThursday      0.471793   0.197065   2.394  0.01684 *  
## day_of_weekFriday        1.102140   0.198329   5.557 3.50e-08 ***
## day_of_weekSaturday      0.930391   0.198011   4.699 2.98e-06 ***
## day_of_weekSunday        0.615126   0.198134   3.105  0.00196 ** 
## fx                       0.787878   0.880843   0.894  0.37129    
## tmedian                 -0.143825   0.999347  -0.144  0.88559    
## rain_sum                 0.137507   0.101046   1.361  0.17387    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.703 on 1018 degrees of freedom
## Multiple R-squared:  0.1609, Adjusted R-squared:  0.1436 
## F-statistic: 9.294 on 21 and 1018 DF,  p-value: < 2.2e-16
# Perform backward stepwise regression
ols2_stepwise_d <- step(
  ols2_full_d, 
  direction = "backward",
  trace = 1 # Prints the stepwise regression process
)
## Start:  AIC=1128.63
## sales_cop ~ numeric_day + seasonal_month + day_of_week + fx + 
##     tmedian + rain_sum
## 
##                  Df Sum of Sq    RSS    AIC
## - tmedian         1     0.060 2951.0 1126.7
## - fx              1     2.319 2953.3 1127.5
## - rain_sum        1     5.368 2956.3 1128.5
## <none>                        2951.0 1128.6
## - seasonal_month 11    79.118 3030.1 1134.2
## - numeric_day     1   143.580 3094.6 1176.0
## - day_of_week     6   175.600 3126.6 1176.8
## 
## Step:  AIC=1126.65
## sales_cop ~ numeric_day + seasonal_month + day_of_week + fx + 
##     rain_sum
## 
##                  Df Sum of Sq    RSS    AIC
## - fx              1     2.277 2953.3 1125.5
## <none>                        2951.0 1126.7
## - rain_sum        1     6.045 2957.1 1126.8
## - seasonal_month 11    79.429 3030.5 1132.3
## - day_of_week     6   175.581 3126.6 1174.8
## - numeric_day     1   146.691 3097.7 1175.1
## 
## Step:  AIC=1125.45
## sales_cop ~ numeric_day + seasonal_month + day_of_week + rain_sum
## 
##                  Df Sum of Sq    RSS    AIC
## <none>                        2953.3 1125.5
## - rain_sum        1    15.758 2969.1 1129.0
## - seasonal_month 11    79.049 3032.4 1130.9
## - numeric_day     1   144.596 3097.9 1173.2
## - day_of_week     6   175.164 3128.5 1173.4
# Summary of the final stepwise model
summary(ols2_stepwise_d)
## 
## Call:
## lm(formula = sales_cop ~ numeric_day + seasonal_month + day_of_week + 
##     rain_sum, data = df_merged_d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1816  -0.1390   0.1437   0.4305   2.3591 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              9.54422    0.59782  15.965  < 2e-16 ***
## numeric_day              0.54288    0.07682   7.067 2.93e-12 ***
## seasonal_monthFebruary   0.42246    0.26300   1.606  0.10851    
## seasonal_monthMarch      0.24004    0.25622   0.937  0.34905    
## seasonal_monthApril      0.17428    0.26036   0.669  0.50339    
## seasonal_monthMay       -0.03250    0.25883  -0.126  0.90009    
## seasonal_monthJune       0.07827    0.26324   0.297  0.76626    
## seasonal_monthJuly       0.21366    0.25892   0.825  0.40946    
## seasonal_monthAugust     0.27803    0.25988   1.070  0.28494    
## seasonal_monthSeptember  0.01683    0.26481   0.064  0.94933    
## seasonal_monthOctober    0.17391    0.26444   0.658  0.51091    
## seasonal_monthNovember  -0.08671    0.30304  -0.286  0.77483    
## seasonal_monthDecember  -0.75067    0.25804  -2.909  0.00370 ** 
## day_of_weekTuesday      -0.06287    0.19788  -0.318  0.75076    
## day_of_weekWednesday     0.24147    0.19735   1.224  0.22139    
## day_of_weekThursday      0.47065    0.19692   2.390  0.01703 *  
## day_of_weekFriday        1.10202    0.19821   5.560 3.45e-08 ***
## day_of_weekSaturday      0.92892    0.19788   4.694 3.04e-06 ***
## day_of_weekSunday        0.61671    0.19783   3.117  0.00188 ** 
## rain_sum                 0.18939    0.08118   2.333  0.01985 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.702 on 1020 degrees of freedom
## Multiple R-squared:  0.1602, Adjusted R-squared:  0.1446 
## F-statistic: 10.24 on 19 and 1020 DF,  p-value: < 2.2e-16
# Add predictions from the final stepwise model
df_merged_d <- add_predictions(ols2_stepwise_d, df_merged_d, "predicted_sales2")

# Plot actual vs predicted daily sales (predictions back-transformed
# with exp() since the models were fit on log sales).
par(mfrow = c(1, 1))
plot(
  df_merged_d$date, exp(df_merged_d$sales_cop),
  type = "p",
  col = "black", pch = 16,
  xlab = "Date",
  ylab = "Sales",
  main = "Actual vs Predicted Daily Sales for All Models"
)

# Overlay one line per model, reading predicted_sales0..predicted_sales2
for (k in 0:2) {
  lines(
    df_merged_d$date, exp(df_merged_d[[paste0("predicted_sales", k)]]),
    col = plot_colors[k + 2],
    lty = line_types[k + 2],
    lwd = 2
  )
}

# Legend placed top-left so it does not cover the upward-trending points
legend(
  "topleft",
  legend = c("Actual Sales", "Model 0", "Model 1", "Model 2 Stepwise"),
  col = c("black", plot_colors[2:4]),
  lty = c(NA, line_types[2:4]),
  pch = c(16, NA, NA, NA),
  bty = "n"
)

# Models to compare (the list was accidentally defined twice with
# identical contents; the duplicate has been removed)
models_d <- list(
  "Model trend" = ols0d,
  "Model trend + season" = ols1d,
  "Model all covariates step" = ols2_stepwise_d
)

# Get R^2 and AIC for each model
model_stats_d <- get_model_stats(models_d)


#  RMSE calculation for the original (exponentiated) scale for daily models
rmse_stats_d <- data.frame(
  Model = character(),
  RMSE = numeric(),
  stringsAsFactors = FALSE
)

# Loop through each model; predicted_sales0..predicted_sales2 were added
# in the same order as the entries of `models_d`, hence the i - 1 index.
for (i in seq_along(models_d)) {
  model_name <- names(models_d)[i]
  predicted_column <- paste0("predicted_sales", i - 1)  # Adjust column name index
  
  # Calculate RMSE on the original scale
  rmse <- calculate_rmse(
    observed = exp(df_merged_d$sales_cop),          # Exponentiate actual values
    predicted = exp(df_merged_d[[predicted_column]])  # Exponentiate predicted values
  )
  
  # Append results to the RMSE stats table
  rmse_stats_d <- rbind(rmse_stats_d, data.frame(Model = model_name, RMSE = rmse))
}

# Combine R^2, AIC, and RMSE into one table
# (note: merge() sorts the rows alphabetically by Model)
combined_stats_d <- merge(model_stats_d, rmse_stats_d, by = "Model")

# View the combined table
print(combined_stats_d)
##                       Model         R2      AIC    RMSE
## 1 Model all covariates step 0.16020994 4078.846 1421301
## 2               Model trend 0.08382823 4133.380 1845259
## 3      Model trend + season 0.15572908 4082.381 1421549
# rmse_stats_d keeps the loop order, so row 3 is the stepwise model
rmse_ols_d <- rmse_stats_d$RMSE[3]
rmse_ols_d
## [1] 1421301
checkresiduals(ols2_stepwise_d$residuals)

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 21.421, df = 10, p-value = 0.01834
## 
## Model df: 0.   Total lags used: 10

Residuals are autocorrelated, and some observations lie far from the mean.

Non Linear Models

Here we explore non linear models, starting from the simplest to more elaborate models in the end combining some of the used models.

Wrangle data for models

The time series are altered so the visualizations are more understandable, basically we change the date index in the timeseries objects

# Re-declare the time series because we dropped some rows earlier.
# Ensure the 'date' columns are in Date format
df_merged_d$date <- as.Date(df_merged_d$date)
df_merged_w$date <- as.Date(df_merged_w$week)
df_merged_m$date <- as.Date(df_merged_m$month)

# Extract the start date and year for each dataframe
start_d <- min(df_merged_d$date)
start_w <- min(df_merged_w$date)
start_m <- min(df_merged_m$date)

# Extract components for daily, weekly, and monthly start times
start_d_year <- as.numeric(format(start_d, "%Y"))
start_d_day <- as.numeric(format(start_d, "%j")) # Day of the year

start_w_year <- as.numeric(format(start_w, "%Y"))
start_w_week <- as.numeric(format(start_w, "%U")) + 1 # Week number, adding 1 since R starts at week 0

start_m_year <- as.numeric(format(start_m, "%Y"))
start_m_month <- as.numeric(format(start_m, "%m"))

# Declare time series with appropriate frequencies.
# NOTE(review): the stored columns appear to be on the log scale, so exp()
# restores the original COP units -- confirm against the upstream transform.
# frequency = 365 treats every year as 365 days (leap days shift alignment).
sales_d_ts <- ts(exp(df_merged_d$sales_cop), start = c(start_d_year, start_d_day), frequency = 365)
sales_w_ts <- ts(exp(df_merged_w$sales_w), start = c(start_w_year, start_w_week), frequency = 52)
sales_m_ts <- ts(exp(df_merged_m$sales_m), start = c(start_m_year, start_m_month), frequency = 12)

food_d_ts <- ts(exp(df_merged_d$food), start = c(start_d_year, start_d_day), frequency = 365)
food_w_ts <- ts(exp(df_merged_w$food_w), start = c(start_w_year, start_w_week), frequency = 52)
food_m_ts <- ts(exp(df_merged_m$food_m), start = c(start_m_year, start_m_month), frequency = 12)

bar_d_ts <- ts(exp(df_merged_d$bar), start = c(start_d_year, start_d_day), frequency = 365)
bar_w_ts <- ts(exp(df_merged_w$bar_w), start = c(start_w_year, start_w_week), frequency = 52)
bar_m_ts <- ts(exp(df_merged_m$bar_m), start = c(start_m_year, start_m_month), frequency = 12)
# Verify the created time series visually
par(mfrow=c(1,1))
plot(sales_d_ts)

plot(sales_w_ts)

plot(sales_m_ts)

plot(food_d_ts)

plot(food_w_ts)

plot(food_m_ts)

plot(bar_d_ts)

plot(bar_w_ts)

plot(bar_m_ts)

Here we fill the sales = 0 values with the mean of the two adjacent dates, in order to obtain smoother models. The dates with sales = 0 are national holidays such as Christmas or New Year's Day, or inventory days on which the kitchen cannot operate, so sales are 0.

# Replace 1s (exp(0), i.e. zero-sales days on the log scale) with the mean
# of the previous and next observations.
#
# Imputation is sequential: an already-imputed value can feed into the
# imputation of the following position (a simple forward pass), which
# matters for runs of consecutive 1s.
#
# @param ts_data A univariate `ts` object.
# @return A `ts` object with the same start and frequency in which every
#   value equal to 1 is replaced by the mean of its available neighbours.
#   A 1 with no non-NA neighbour (e.g. a length-1 series) is left as-is
#   instead of becoming NaN; NA observations are skipped instead of
#   crashing the comparison.
fill_ones <- function(ts_data) {
  # Convert time series to a plain numeric vector
  ts_vec <- as.numeric(ts_data)
  n <- length(ts_vec)
  
  # Loop through and replace 1s
  for (i in seq_len(n)) {
    # Guard against NA: `if (NA == 1)` would error out
    if (!is.na(ts_vec[i]) && ts_vec[i] == 1) {
      # Scalar if/else rather than ifelse(): ifelse() is vectorized and
      # unidiomatic (and slower) for scalar conditions.
      prev_val <- if (i > 1) ts_vec[i - 1] else NA_real_
      next_val <- if (i < n) ts_vec[i + 1] else NA_real_
      
      neighbours <- c(prev_val, next_val)
      if (any(!is.na(neighbours))) {
        # Replace with mean of previous and next, ignoring NA
        ts_vec[i] <- mean(neighbours, na.rm = TRUE)
      }
      # else: both neighbours missing -> keep the original value rather
      # than producing NaN (mean of an empty set).
    }
  }
  
  # Return as time series with original attributes
  ts(ts_vec, start = start(ts_data), frequency = frequency(ts_data))
}

# Apply the zero-sales imputation to all nine series (overwriting in place)
sales_d_ts <- fill_ones(sales_d_ts)
sales_w_ts <- fill_ones(sales_w_ts)
sales_m_ts <- fill_ones(sales_m_ts)


food_d_ts <- fill_ones(food_d_ts)
food_w_ts <- fill_ones(food_w_ts)
food_m_ts <- fill_ones(food_m_ts)

bar_d_ts <- fill_ones(bar_d_ts)
bar_w_ts <- fill_ones(bar_w_ts)
bar_m_ts <- fill_ones(bar_m_ts)

Bass Model

# Some simple plots: instantaneous and cumulative monthly sales
plot(sales_m_ts)

plot(cumsum(sales_m_ts)) #Returns a vector whose elements are the cumulative sums

# Bass model fitted on monthly sales (treats cumulative sales as adoption)
bm_m<-BM(sales_m_ts,display = T) # show graphical view of results / display = True

summary(bm_m)
## Call: ( Standard Bass Model )
## 
##   BM(series = sales_m_ts, display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -94382506 -37311459 -12636337 -10394220  24066002  75506534 
## 
## Coefficients:
##        Estimate    Std.Error        Lower        Upper  p-value    
## m 4.664349e+09 2.000317e+08 4.272294e+09 5.056404e+09 4.48e-22 ***
## p 8.332383e-03 2.359472e-04 7.869934e-03 8.794831e-03 8.52e-28 ***
## q 8.966748e-02 5.371447e-03 7.913963e-02 1.001953e-01 1.18e-17 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  47282474  on  33  degrees of freedom
##  Multiple R-squared:   0.999374  Residual sum of squares:  7.377587e+16
# Remaining market potential: estimated m minus sales realized so far
bm_m$coefficients['m'] - sum(sales_m_ts)
##          m 
## 1148560231

According to this, there are only about 1,150 million COP left to sell, i.e. less than a year of sales, which seems wrong. The model fits well, but the behavior from observation 30 onward is weird, and sales might not be declining yet. Still, it reflects the innovation and imitation effects in some sense.

The restaurant relies on word of mouth to reach the full market potential of m = 4,664 million COP, i.e. approximately 1 million EUR. The restaurant has sold 3,515,788,885 COP so far, so according to the model sales should be exhausted in about one year. The innovation coefficient p = 0.833% indicates that adoption due to external influence is relatively low, though not uncommon for many markets — it is actually relatively innovative. The imitation coefficient q = 8.97% suggests that imitation plays a larger role than innovation in driving adoption in this market.

# In-sample prediction of the cumulative curve, then converted to
# instantaneous (per-period) values for comparison with the observed series
pred_bm_m<- predict(bm_m, newx=c(1:length(sales_m_ts)))
pred_bm_m <- ts(pred_bm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
pred.inst_bm_m <- make.instantaneous(pred_bm_m)
pred.inst_bm_m <- ts(pred.inst_bm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))

# plot actual observations as points
plot(sales_m_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Month", ylab = "Monthly Sales", main = "Actual vs Fitted Sales")

# Add the fitted values as a line
lines(pred.inst_bm_m, col = "red", lwd = 2)

# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))

# check residuals (observed minus instantaneous fit)
res_bm_m <- sales_m_ts - pred.inst_bm_m
tsdisplay(res_bm_m)

Residuals show some structure, and lag 2 is significantly autocorrelated.

# Calculate RMSE for Bass Model predictions (monthly, original scale)
rmse_bm_m <- calculate_rmse(observed = sales_m_ts, predicted = pred.inst_bm_m)

# Print the RMSE
cat("RMSE for Bass Model Predictions:", rmse_bm_m, "\n")
## RMSE for Bass Model Predictions: 18498870
# Same Bass model fitted on the weekly series
bm_w<-BM(sales_w_ts,display = T) # show graphical view of results / display = True

summary(bm_w)
## Call: ( Standard Bass Model )
## 
##   BM(series = sales_w_ts, display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -90378130 -33009033 -13970791  -8913677  19483237  72394978 
## 
## Coefficients:
##        Estimate    Std.Error        Lower        Upper   p-value    
## m 4.694285e+09 9.071530e+07 4.516486e+09 4.872084e+09  2.04e-98 ***
## p 2.077484e-03 2.457719e-05 2.029314e-03 2.125654e-03 1.08e-129 ***
## q 1.989569e-02 5.380539e-04 1.884113e-02 2.095026e-02  7.05e-78 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  40676860  on  152  degrees of freedom
##  Multiple R-squared:   0.999528  Residual sum of squares:  2.515003e+17
# Remaining market potential under the weekly fit
bm_w$coefficients['m'] - sum(sales_w_ts)
##          m 
## 1178496087
# results are similar in terms of m; p and q are on another scale
# because they refer to a different time step (weeks vs months)
bm_m$coefficients['q'] / bm_w$coefficients['q'] # they are approx 4 times
##        q 
## 4.506878
bm_m$coefficients['p'] / bm_w$coefficients['p'] # they are approx 4 times
##        p 
## 4.010805
# which makes sense: there are roughly 4 weeks per month

The monthly coefficients are approximately 4 times those of the weekly model, which makes sense because there are about 4 weeks in a month, while the market potential is similar.

# Prediction: in-sample cumulative fit converted to per-week values
pred_bm_w<- predict(bm_w, newx=c(1:length(sales_w_ts)))
pred_bm_w <- ts(pred_bm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
pred.inst_bm_w <- make.instantaneous(pred_bm_w)
pred.inst_bm_w <- ts(pred.inst_bm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))

# plot actual observations as points
plot(sales_w_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Week", ylab = "Weekly Sales", main = "Actual vs Fitted Sales")

# Add the fitted values as a line
lines(pred.inst_bm_w, col = "red", lwd = 2)

# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))

# check residuals
res_bm_w <- sales_w_ts - pred.inst_bm_w
tsdisplay(res_bm_w)

Residuals show structure, lag 2 is autocorrelated, and there is a clear trend in the residuals.

# RMSE
# Calculate RMSE for Bass Model predictions (weekly)
rmse_bm_w <- calculate_rmse(observed = sales_w_ts, predicted = pred.inst_bm_w)

# Print the RMSE
cat("RMSE for Bass Model Predictions:", rmse_bm_w, "\n")
## RMSE for Bass Model Predictions: 4542019
# Daily Bass model; the optimizer needs starting values here
# (prelimestimates = c(m, p, q)) to converge on the daily series
bm_d <- BM(
  sales_d_ts,
  prelimestimates = c(1.2 * sum(sales_d_ts), 0.005, 0.5), # Adjust these estimates
  display = TRUE
)

summary(bm_d)
## Call: ( Standard Bass Model )
## 
##   BM(series = sales_d_ts, prelimestimates = c(1.2 * sum(sales_d_ts), 
##     0.005, 0.5), display = TRUE)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -81086932 -29957254 -10704557  -7197438  17139516  79381813 
## 
## Coefficients:
##        Estimate    Std.Error        Lower        Upper p-value    
## m 4.631642e+09 3.177263e+07 4.569369e+09 4.693916e+09       0 ***
## p 3.258569e-04 1.314186e-06 3.232811e-04 3.284326e-04       0 ***
## q 2.847155e-03 2.754612e-05 2.793166e-03 2.901144e-03       0 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  34753453  on  1037  degrees of freedom
##  Multiple R-squared:   0.999642  Residual sum of squares:  1.252491e+18
# Remaining market potential under the daily fit
bm_d$coefficients['m'] - sum(sales_d_ts)
##          m 
## 1178708770
# results are similar in terms of m; p and q are on another scale
# because they refer to a different time step (days vs weeks)
bm_w$coefficients['q'] / bm_d$coefficients['q'] # they are approx 7 times
##        q 
## 6.987921
bm_w$coefficients['p'] / bm_d$coefficients['p'] # they are approx 7 times
##        p 
## 6.375449

The daily coefficients are approximately 1/7 of the weekly ones, which makes sense given there are 7 days in a week. The market potential is also of a similar order of magnitude.

# Prediction: in-sample cumulative fit converted to per-day values
pred_bm_d <- predict(bm_d, newx = c(1:length(sales_d_ts)))
pred_bm_d <- ts(pred_bm_d, start = start(sales_d_ts), frequency = frequency(sales_d_ts))
pred.inst_bm_d <- make.instantaneous(pred_bm_d)
pred.inst_bm_d <- ts(pred.inst_bm_d, start = start(sales_d_ts), frequency = frequency(sales_d_ts))

# Plot actual vs fitted sales for daily data
plot(sales_d_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Day", ylab = "Daily Sales", main = "Actual vs Fitted Sales (Daily)")

# Add the fitted values as a line
lines(pred.inst_bm_d, col = "red", lwd = 2)

# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))

# Check residuals
res_bm_d <- sales_d_ts - pred.inst_bm_d
tsdisplay(res_bm_d)

Residuals do not seem stationary, or at least they show a lot of autocorrelation.

# Calculate RMSE for Bass Model predictions (daily data, original scale)
rmse_bm_d <- calculate_rmse(observed = sales_d_ts, predicted = pred.inst_bm_d)

# Print the RMSE
cat("RMSE for Daily Bass Model Predictions:", rmse_bm_d, "\n")
## RMSE for Daily Bass Model Predictions: 1651896

Limitations of the Bass Model

  • Bass model assumes that every product succeeds and the sales saturate to the steady state level. However, most new products fail in reality.

  • The market potential m is constant along the whole life cycle.

  • Bass model predictions work well only after the sales inflection point. If sales of a category keep rising like a J-curve, it can overestimate the overall market size.

  • It is a model for products with a limited life cycle: this requires an explicit hypothesis about the product's life cycle.

  • Another drawback of the Bass model is that the diffusion pattern is not affected by marketing-mix variables such as price or advertising.

The Generalized Bass model extends the original Bass model, allowing marketing-mix variables to play a role.

Generalized Bass Model

Bass model is used to forecast the adoption of a new product and to predict the sales, since it determines the shape of the curve of a model that represent the cumulative adoption of a new product. The Generalized Bass model extends the original Bass model by incorporating marketing mix variables. We can know the effect of pricing, promotions on the new product diffusion curve. It is more flexible than the original Bass model.

# Starting values for the GBM optimizer; presumably taken from a previous
# Bass fit of the monthly series -- TODO confirm their origin.
m <- 4.451570e+09
p <- 8.472917e-03
q <- 9.415625e-02

# Generalized Bass model with one exponential shock.
# prelimestimates = c(m, p, q, a1, b1, c1); the last three parametrize the
# shock -- NOTE(review): presumably start time, memory and intensity per the
# DIMORA documentation, verify before reusing.
GBM_monthly_sales <- GBM(
  sales_m_ts, 
  shock = 'exp', 
  nshock = 1,
  #prelimestimates = c(m,p,q, 12, 0.1, -0.1)
  prelimestimates = c(m,p,q, 10, 0.1, 2)
  #prelimestimates = c(m,p,q, 11, 15, -0.1)
  )

summary(GBM_monthly_sales)
## Call: ( Generalized Bass model with 1  Exponential  shock )
## 
##   GBM(series = sales_m_ts, shock = "exp", nshock = 1, prelimestimates = c(m, 
##     p, q, 10, 0.1, 2))
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -27853732 -10753200   -127747   -619963   6529609  25477494 
## 
## Coefficients:
##            Estimate    Std.Error         Lower        Upper  p-value    
## m     1.004455e+10 1.683462e+10 -2.295071e+10 4.303980e+10 5.55e-01    
## p     2.360904e-03 3.666121e-03 -4.824562e-03 9.546370e-03 5.24e-01    
## q     4.120966e-02 6.359504e-02 -8.343433e-02 1.658536e-01 5.22e-01    
## a1    5.329896e+00 2.786829e-01  4.783688e+00 5.876104e+00 2.35e-18 ***
## b1   -9.260056e-02 1.382273e-01 -3.635212e-01 1.783200e-01 5.08e-01    
## c1    1.846342e+00 8.143514e-01  2.502422e-01 3.442441e+00 3.07e-02   *
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  13902026  on  30  degrees of freedom
##  Multiple R-squared:   0.999866  Residual sum of squares:  5.79799e+15
# Predict 60 periods ahead (cumulative), then convert to per-month values
pred_GBM_monthly_sales<- predict(GBM_monthly_sales, newx=c(1:60))
pred_GBM_monthly_sales.inst<- make.instantaneous(pred_GBM_monthly_sales)

Guseo-Guidolin Model

# Monthly Guseo-Guidolin models: four choices of the market-potential
# function mt are compared.
ggm1 <- GGM(sales_m_ts, mt='base', display = T)
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced

## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
# Alternative: chi-square CDF as the market-potential function
ggm2 <- GGM(sales_m_ts, mt= function(x) pchisq(x,10),display = T)

summary(ggm1)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_m_ts, mt = "base", display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -27544719  -8809035    742251   -337148   7701034  23238738 
## 
## Coefficients:
##           Estimate    Std.Error        Lower        Upper  p-value    
## K    9.208740e+09 2.454321e+09 4.398360e+09 1.401912e+10 7.24e-04 ***
## pc   2.617639e-02 3.002777e-03 2.029106e-02 3.206173e-02 7.64e-10 ***
## qc   2.453426e-01 3.079890e-02 1.849779e-01 3.057074e-01 5.41e-09 ***
## ps   6.782088e-03 1.329165e-03 4.176972e-03 9.387204e-03 1.60e-05 ***
## qs   3.849713e-02 8.943356e-03 2.096848e-02 5.602579e-02 1.56e-04 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  13318300  on  31  degrees of freedom
##  Multiple R-squared:   0.999873  Residual sum of squares:  5.498691e+15
summary(ggm2)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_m_ts, mt = function(x) pchisq(x, 10), display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -55479254 -21799576   5818484  25726804  63528263 144216338 
## 
## Coefficients:
##          Estimate    Std.Error         Lower        Upper p-value  
## K   6.606326e+10 8.818450e+11 -1.662321e+12 1.794448e+12   0.941  
## ps  1.115796e-03 1.526548e-02 -2.880400e-02 3.103559e-02   0.942  
## qs  1.615629e-02 2.705220e-02 -3.686506e-02 6.917763e-02   0.554  
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  64933559  on  33  degrees of freedom
##  Multiple R-squared:   0.99678  Residual sum of squares:  1.391401e+17
# try different functions for market potential

ggm3 <- GGM(sales_m_ts, mt= function(x) log(x),display = T)

ggm4 <- GGM(sales_m_ts, mt= function(x) (x)**(1/1.05),display = T)

summary(ggm3)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_m_ts, mt = function(x) log(x), display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -51543377 -17143219  -1444208  -3434599   7197387  38682408 
## 
## Coefficients:
##          Estimate    Std.Error        Lower        Upper  p-value    
## K   1.484699e+09 7.664004e+07 1.334488e+09 1.634911e+09 1.35e-19 ***
## ps  1.462118e-02 4.299105e-04 1.377857e-02 1.546379e-02 2.86e-27 ***
## qs  4.575437e-02 4.515590e-03 3.690397e-02 5.460476e-02 1.16e-11 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  23487767  on  33  degrees of freedom
##  Multiple R-squared:   0.999579  Residual sum of squares:  1.820528e+16
summary(ggm4)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_m_ts, mt = function(x) (x)^(1/1.05), display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -33877044  -9900757   -986705  -1517515  10675797  29488516 
## 
## Coefficients:
##           Estimate    Std.Error         Lower         Upper  p-value    
## K    1.156377e+09 1.344628e+09 -1.479046e+09  3.791800e+09 3.96e-01    
## ps   7.688684e-03 9.373693e-03 -1.068342e-02  2.606078e-02 4.18e-01    
## qs  -7.001165e-02 1.984748e-03 -7.390169e-02 -6.612162e-02 8.84e-28 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  16905824  on  33  degrees of freedom
##  Multiple R-squared:   0.999782  Residual sum of squares:  9.431627e+15

# NOTE(review): the constants below are defined but never used afterwards
# (the predictions use ggm1 directly); they look like leftovers from an
# earlier run -- consider removing.
K <- 7.683785e+09

pc <- 2.698613e-02

qc <- 2.582412e-01

ps <- 7.731763e-03

qs <- 4.508202e-02

# Predictions from the base GGM (ggm1): cumulative fit -> per-month values
pred_ggm_m <- predict(ggm1, newx = c(1:length(sales_m_ts)))
pred_ggm_m <- ts(pred_ggm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))
pred.inst_ggm_m <- make.instantaneous(pred_ggm_m)
pred.inst_ggm_m <- ts(pred.inst_ggm_m, start = start(sales_m_ts), frequency = frequency(sales_m_ts))

# Plot actual vs fitted sales for monthly data
plot(sales_m_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Month", ylab = "Monthly Sales", main = "Actual vs Fitted Sales (GGM Model)")

# Add the fitted values as a line
lines(pred.inst_ggm_m, col = "red", lwd = 2)

# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM)"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))

#Analysis of residuals
res_GGM_m<- sales_m_ts - pred.inst_ggm_m
tsdisplay(res_GGM_m)

Residuals look stationary for this model

# Residuals somehow are kind of stationary
# check for stationarity of residuals (ADF: H0 = unit root / non-stationary)
adf_test <- adf.test(res_GGM_m)
print(adf_test) # if p-val < alpha, series stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  res_GGM_m
## Dickey-Fuller = -4.1, Lag order = 3, p-value = 0.01708
## alternative hypothesis: stationary
# so with this model we achieve stationary series

# check for autocorrelation in residuals (Ljung-Box: H0 = independence)
Box.test(res_GGM_m, lag = 10, type = "Ljung-Box") # h0 res indep
## 
##  Box-Ljung test
## 
## data:  res_GGM_m
## X-squared = 16.263, df = 10, p-value = 0.09234
# p-val > alpha => fail to reject h0, so residuals seem indep

Residuals are likely stationary.

# Calculate RMSE for ggm1 (monthly base GGM, original scale)
rmse_ggm1 <- calculate_rmse(observed = sales_m_ts, predicted = pred.inst_ggm_m)

# Print RMSE for ggm1
cat("RMSE for GGM Model 1 (Base):", rmse_ggm1, "\n")
## RMSE for GGM Model 1 (Base): 11759505
# Weekly GGM fits: same four market-potential choices as the monthly case
ggm1_w <- GGM(sales_w_ts, mt='base', display = T)

ggm2_w <- GGM(sales_w_ts, mt= function(x) pchisq(x,25),display = T)

summary(ggm1_w) # this one is better
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_w_ts, mt = "base", display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -30941134  -8439496   1818273     88704   8469414  21113528 
## 
## Coefficients:
##           Estimate    Std.Error        Lower        Upper  p-value    
## K    6.454454e+09 5.261291e+08 5.423260e+09 7.485648e+09 2.13e-24 ***
## pc   3.542302e-04 2.760784e-05 3.001199e-04 4.083406e-04 6.66e-26 ***
## qc   2.078354e-02 1.326387e-03 1.818387e-02 2.338321e-02 2.16e-33 ***
## ps   1.149074e-02 3.238499e-04 1.085601e-02 1.212547e-02 7.53e-75 ***
## qs   3.110864e-02 2.164583e-03 2.686613e-02 3.535114e-02 5.35e-30 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  12271825  on  150  degrees of freedom
##  Multiple R-squared:   0.999876  Residual sum of squares:  2.258965e+16
summary(ggm2_w)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_w_ts, mt = function(x) pchisq(x, 25), display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -34347002 -18821378   -462825   7370123  29484361  96549738 
## 
## Coefficients:
##          Estimate    Std.Error        Lower        Upper   p-value    
## K   5.300184e+09 1.334365e+08 5.038653e+09 5.561715e+09  3.66e-82 ***
## ps  2.098620e-03 2.848830e-05 2.042784e-03 2.154456e-03 7.94e-121 ***
## qs  1.651520e-02 5.058330e-04 1.552378e-02 1.750661e-02  1.41e-70 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  33658058  on  152  degrees of freedom
##  Multiple R-squared:   0.999055  Residual sum of squares:  1.721955e+17
# try different functions for market potential

ggm3_w <- GGM(sales_w_ts, mt= function(x) log(x),display = T)

ggm4_w <- GGM(sales_w_ts, mt= function(x) (x)**(1/1.05),display = T)

summary(ggm3_w)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_w_ts, mt = function(x) log(x), display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -62209555 -22307652  -3737433  -4861230  11043715  47916399 
## 
## Coefficients:
##          Estimate    Std.Error        Lower        Upper   p-value    
## K   9.913091e+08 1.987146e+07 9.523618e+08 1.030256e+09  3.92e-96 ***
## ps  3.072264e-03 3.266103e-05 3.008250e-03 3.136279e-03 1.29e-136 ***
## qs  1.365598e-02 4.799543e-04 1.271528e-02 1.459667e-02  9.15e-63 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  26307052  on  152  degrees of freedom
##  Multiple R-squared:   0.999423  Residual sum of squares:  1.051933e+17
summary(ggm4_w) # better shaped but less significant
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_w_ts, mt = function(x) (x)^(1/1.05), display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -34605136 -10571355   2199183   -413132   8482721  26917353 
## 
## Coefficients:
##           Estimate    Std.Error         Lower         Upper   p-value    
## K    9.122958e+07 1.292809e+07  6.589099e+07  1.165682e+08  5.66e-11 ***
## ps   6.642281e-03 1.054715e-03  4.575079e-03  8.709484e-03  3.09e-09 ***
## qs  -1.898405e-02 2.042294e-04 -1.938433e-02 -1.858377e-02 7.61e-136 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  14257525  on  152  degrees of freedom
##  Multiple R-squared:   0.99983  Residual sum of squares:  3.089811e+16
# Predictions from the base weekly GGM (ggm1_w): cumulative -> per-week
pred_ggm_w <- predict(ggm1_w, newx = c(1:length(sales_w_ts)))
pred_ggm_w <- ts(pred_ggm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))
pred.inst_ggm_w <- make.instantaneous(pred_ggm_w)
pred.inst_ggm_w <- ts(pred.inst_ggm_w, start = start(sales_w_ts), frequency = frequency(sales_w_ts))

# Plot actual vs fitted sales for weekly data
plot(sales_w_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Week", ylab = "Weekly Sales", main = "Actual vs Fitted Sales (GGM Model)")

# Add the fitted values as a line
lines(pred.inst_ggm_w, col = "red", lwd = 2)

# Add a legend
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM)"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))

# Analysis of residuals
res_GGM_w <- sales_w_ts - pred.inst_ggm_w
tsdisplay(res_GGM_w)

# Check for stationarity of residuals (ADF: H0 = non-stationary)
adf_test_w <- adf.test(res_GGM_w)
## Warning in adf.test(res_GGM_w): p-value smaller than printed p-value
print(adf_test_w) # if p-value < alpha, series is stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  res_GGM_w
## Dickey-Fuller = -4.3175, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
# Check for autocorrelation in residuals (Ljung-Box: H0 = independence)
box_test_w <- Box.test(res_GGM_w, lag = 10, type = "Ljung-Box")
print(box_test_w) # if p-value > alpha, residuals are independent
## 
##  Box-Ljung test
## 
## data:  res_GGM_w
## X-squared = 55.579, df = 10, p-value = 2.462e-08

Series is stationary according to tests, but clearly has strong autocorrelation

# RMSE for the weekly GGM fit
rmse_ggm_w <- calculate_rmse(observed = sales_w_ts, predicted = pred.inst_ggm_w)

# Print the RMSE
cat("RMSE for Weekly GGM Model Predictions:", rmse_ggm_w, "\n")
## RMSE for Weekly GGM Model Predictions: 3453199
# Daily GGM
# Min-max scaling of the daily sales to [0, 1]; sales_min/sales_max are
# kept so predictions can be mapped back to COP later.
sales_min <- min(sales_d_ts)
sales_max <- max(sales_d_ts)
sales_scaled <- (sales_d_ts - sales_min) / (sales_max - sales_min)

# View scaled data
summary(sales_scaled)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1904  0.2867  0.3217  0.4343  1.0000
plot(sales_scaled, type = "l", main = "Scaled Daily Sales", xlab = "Day", ylab = "Scaled Sales")

We re-scale the data because otherwise the model won't converge.

# Fit GGM models using scaled data; same four market-potential choices
ggm1_d <- GGM(sales_scaled, mt = 'base', display = T)
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced

## Warning in sqrt((1 - exp(-(pc + qc) * t))/(1 + (qc/pc) * exp(-(pc + qc) * :
## NaNs produced
ggm2_d <- GGM(sales_scaled, mt = function(x) pchisq(x, 10), display = T)

ggm3_d <- GGM(sales_scaled, mt = function(x) log(x), display = T)

ggm4_d <- GGM(sales_scaled, mt = function(x) (x)^(1/1.05), display = T)

# Summarize models
summary(ggm1_d)  # Base model
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_scaled, mt = "base", display = T)
## 
## Residuals:
##      Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.40144 -0.89220  0.18363  0.04369  0.98854  2.48411 
## 
## Coefficients:
##           Estimate    Std.Error         Lower        Upper   p-value    
## K    1.174310e+03 4.487889e+02 294.699957932 2.053920e+03  9.01e-03  **
## pc   2.384956e-05 1.641538e-05  -0.000008324 5.602311e-05  1.47e-01    
## qc   2.113894e-03 1.971344e-04   0.001727518 2.500270e-03  1.64e-25 ***
## ps   1.722612e-03 5.791737e-05   0.001609096 1.836128e-03 5.37e-141 ***
## qs   2.948140e-03 1.218250e-04   0.002709367 3.186912e-03 6.84e-103 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  1.202014  on  1035  degrees of freedom
##  Multiple R-squared:   0.999862  Residual sum of squares:  1495.407
summary(ggm2_d)  # Chi-squared
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_scaled, mt = function(x) pchisq(x, 10), display = T)
## 
## Residuals:
##     Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -7.8650 -2.9043 -0.9269 -0.6900  1.6614  7.6964 
## 
## Coefficients:
##          Estimate    Std.Error        Lower        Upper p-value    
## K   4.487566e+02 3.078722e+00 4.427224e+02 4.547908e+02       0 ***
## ps  3.257391e-04 1.314285e-06 3.231632e-04 3.283151e-04       0 ***
## qs  2.848319e-03 2.756036e-05 2.794302e-03 2.902336e-03       0 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  3.369694  on  1037  degrees of freedom
##  Multiple R-squared:   0.998917  Residual sum of squares:  11774.97
summary(ggm3_d)  # Log transformation
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_scaled, mt = function(x) log(x), display = T)
## 
## Residuals:
##     Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -6.0614 -2.1300 -0.4001 -0.4470  1.0521  5.8832 
## 
## Coefficients:
##          Estimate    Std.Error        Lower        Upper p-value    
## K   6.700548e+01 4.568457e-01 66.110077198 6.790088e+01       0 ***
## ps  4.296123e-04 1.568576e-06  0.000426538 4.326867e-04       0 ***
## qs  2.228507e-03 2.530593e-05  0.002178908 2.278106e-03       0 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  2.489536  on  1037  degrees of freedom
##  Multiple R-squared:   0.999409  Residual sum of squares:  6427.109
summary(ggm4_d)  # Power transformation
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_scaled, mt = function(x) (x)^(1/1.05), display = T)
## 
## Residuals:
##      Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.33513 -0.98706  0.18913  0.07835  0.93237  3.00796 
## 
## Coefficients:
##          Estimate    Std.Error        Lower        Upper   p-value    
## K    0.963478949 3.373281e-02  0.897363853  1.029594046 7.67e-133 ***
## ps   0.001651335 6.856616e-05  0.001516948  0.001785722 3.77e-102 ***
## qs  -0.003188738 1.158087e-05 -0.003211436 -0.003166040  0.00e+00 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  1.307058  on  1037  degrees of freedom
##  Multiple R-squared:   0.999837  Residual sum of squares:  1771.61

Predict on the best model based on fit and p-values We select model 1

# Prediction using the base daily GGM model (on the scaled series)
pred_ggm_d <- predict(ggm1_d, newx = c(1:length(sales_scaled)))
pred_ggm_d <- ts(pred_ggm_d, start = start(sales_scaled), frequency = frequency(sales_scaled))
pred.inst_ggm_d <- make.instantaneous(pred_ggm_d)
pred.inst_ggm_d <- ts(pred.inst_ggm_d, start = start(sales_scaled), frequency = frequency(sales_scaled))

# Re-scale predictions back to the original scale (inverse of the min-max
# scaling applied before fitting)
pred_original_scale <- (pred.inst_ggm_d * (sales_max - sales_min)) + sales_min

# Plot actual vs fitted sales (original scale)
plot(sales_d_ts, type = "p", col = "black", pch = 16, cex = 0.7,
     xlab = "Day", ylab = "Daily Sales", main = "Actual vs Fitted Sales (Original Scale)")
lines(pred_original_scale, col = "red", lwd = 2)
legend("topleft", legend = c("Actual Values", "Fitted Values (GGM, Original Scale)"),
       col = c("black", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))

# Analysis of residuals (original scale)
res_GGM_d <- sales_d_ts - pred_original_scale
tsdisplay(res_GGM_d, main = "Residuals of GGM Model")

Residuals don't look stationary.

# Check for stationarity of residuals (ADF: H0 = non-stationary)
adf_test_d <- adf.test(res_GGM_d)
## Warning in adf.test(res_GGM_d): p-value smaller than printed p-value
print(adf_test_d)  # If p-value < alpha, series is stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  res_GGM_d
## Dickey-Fuller = -7.8042, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# according to this, they are stationary

# Check for autocorrelation in residuals (Ljung-Box: H0 = independence)
box_test_d <- Box.test(res_GGM_d, lag = 10, type = "Ljung-Box")
print(box_test_d)  # If p-value > alpha, residuals are indep
## 
##  Box-Ljung test
## 
## data:  res_GGM_d
## X-squared = 1403.5, df = 10, p-value < 2.2e-16

Residuals look stationary in the test, but they have serial correlation.

# Calculate RMSE for GGM model predictions (original scale)
# calculate_rmse() is a helper defined earlier in this document
rmse_ggm_d <- calculate_rmse(observed = sales_d_ts, predicted = pred_original_scale)

# Print the RMSE
cat("RMSE for Daily GGM Model Predictions (Original Scale):", rmse_ggm_d, "\n")
## RMSE for Daily GGM Model Predictions (Original Scale): 1600510

Holt-Winters Model

# adjust timeseries to ensure date consistency
# (monthly series starting November 2021)
sales_m_ts <- ts(sales_m_ts, frequency=12, start=c(2021, 11))

# Holt-Winters with additive vs multiplicative seasonality
hw1_m<- hw(sales_m_ts, seasonal="additive")
hw2_m<- hw(sales_m_ts, seasonal="multiplicative")

# prediction
# One-step-ahead in-sample fitted values
fitted_hw1 <- hw1_m$fitted
fitted_hw2 <- hw2_m$fitted

We now plot the models

# Assemble actual and fitted Holt-Winters series into one data frame
plot_data <- data.frame(
  Time                  = time(sales_m_ts),
  Actual                = as.numeric(sales_m_ts),
  Fitted_Additive       = as.numeric(hw1_m$fitted),
  Fitted_Multiplicative = as.numeric(hw2_m$fitted)
)

# Reshape to long format (one row per series/time point) for ggplot
library(reshape2)
plot_data_melted <- melt(
  plot_data,
  id.vars       = "Time",
  variable.name = "Series",
  value.name    = "Value"
)

# Plot using ggplot2
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
  geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) + # Actual values as dots
  geom_line(data = subset(plot_data_melted, Series != "Actual"), size = 1) +  # Fitted values as lines
  labs(
    title = "Actual vs Fitted Values",
    x = "Time",
    y = "Value",
    color = "Series"
  ) +
  scale_color_manual(
    values = c("Actual" = "black", "Fitted_Additive" = "blue", "Fitted_Multiplicative" = "red"),
    labels = c("Actual", "Fitted (Additive)", "Fitted (Multiplicative)")
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.title = element_text(face = "bold")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Looks like the multiplicative model follows the data more closely in general.

# residuals
# One-step-ahead residuals for both Holt-Winters fits
residuals_hw1 <- residuals(hw1_m)  
residuals_hw2 <- residuals(hw2_m)  
tsdisplay(residuals_hw1)

tsdisplay(residuals_hw2)

# Stationarity and Correlation
# check for stationarity of residuals
# additive
adf_test <- adf.test(residuals_hw1) # H0: series is non-stationary
print(adf_test) # if p-val < alpha, reject H0: residuals are stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals_hw1
## Dickey-Fuller = -1.8495, Lag order = 3, p-value = 0.6317
## alternative hypothesis: stationary
# p-value = 0.63 > alpha: we cannot reject H0, so the additive-model
# residuals are NOT shown to be stationary by the ADF test
# multiplicative
adf_test <- adf.test(residuals_hw2) # H0: series is non-stationary
print(adf_test) # if p-val < alpha, reject H0: residuals are stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals_hw2
## Dickey-Fuller = -2.0941, Lag order = 3, p-value = 0.5365
## alternative hypothesis: stationary
# p-value = 0.54 > alpha: cannot reject H0, so these residuals are not
# shown to be stationary by the ADF test either

# additive
# check for autocorrelation in residuals (Ljung-Box)
Box.test(residuals_hw1, lag = 10, type = "Ljung-Box") # h0 res indep
## 
##  Box-Ljung test
## 
## data:  residuals_hw1
## X-squared = 4.8498, df = 10, p-value = 0.901
# p-val > alpha =>  Dont reject h0, so residuals are indep

# multiplicative (the previous label "additive" was a copy-paste slip)
# check for autocorrelation in residuals (Ljung-Box)
Box.test(residuals_hw2, lag = 10, type = "Ljung-Box") # h0 res indep
## 
##  Box-Ljung test
## 
## data:  residuals_hw2
## X-squared = 10.493, df = 10, p-value = 0.3983
# p-val > alpha =>  Dont reject h0, so residuals are indep

Multiplicative model follows the data better and has slightly better residuals

# forecast
# 12-month-ahead forecasts for both Holt-Winters models
forecast_hw1 <- forecast(hw1_m, h=12)
forecast_hw2 <- forecast(hw2_m, h=12)

# Forecast plot
# Plot the time series with both forecasts.
# NOTE: autolayer() on a plain `ts` (the $mean component) has no PI argument;
# passing PI=F only produced an "Ignoring unknown parameters" warning,
# so it has been removed.
autoplot(sales_m_ts) +
  autolayer(forecast_hw1$mean, series="Additive Holt-Winters Forecast") +
  autolayer(forecast_hw2$mean, series="Multiplicative Holt-Winters Forecast") +
  ggtitle("Sales Forecast with Holt-Winters Models") +
  xlab("Time") +
  ylab("Sales") +
  scale_color_manual(
    values=c("Additive Holt-Winters Forecast" = "blue",
             "Multiplicative Holt-Winters Forecast" = "red")
  ) +
  theme_minimal() +
  theme(legend.position = "top", legend.title = element_blank())
## Warning in ggplot2::geom_line(ggplot2::aes(x = .data[["timeVal"]], y = .data[["seriesVal"]], : Ignoring unknown parameters: `PI`
## Ignoring unknown parameters: `PI`

# RMSE Calculation for Holt-Winters models (in-sample fitted values)
rmse_hw1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_hw1)
rmse_hw2 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_hw2)

# Print RMSE values
cat("RMSE for Additive Holt-Winters Model:", rmse_hw1, "\n")
## RMSE for Additive Holt-Winters Model: 14123520
cat("RMSE for Multiplicative Holt-Winters Model:", rmse_hw2, "\n")
## RMSE for Multiplicative Holt-Winters Model: 13169921

Multiplicative model is better

The Holt winters model has a frequency limit of 24, so we cannot do larger than that. Weekly and daily data have 52 and 365 frequencies respectively so we cannot fit the model with the R implementation so far.

ARIMA models

ARIMA is an acronym for Auto Regressive Integrated Moving Average: ARIMA(p,d,q), where p refers to the AR part, q refers to the MA part and d is the degree of first difference involved.

First we need to check if the series is stationary

# see if series is stationary (ADF on the raw monthly series)
adf.test(sales_m_ts) #H0, series is non-stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sales_m_ts
## Dickey-Fuller = -2.655, Lag order = 3, p-value = 0.3183
## alternative hypothesis: stationary
# p-val > 0.05 => dont reject, non stationary: series is not stationary
adf.test(diff(sales_m_ts)) #H0, series is non-stationary (first difference)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(sales_m_ts)
## Dickey-Fuller = -3.3757, Lag order = 3, p-value = 0.07725
## alternative hypothesis: stationary
# see the acf and pacf (note: p-value 0.077 is borderline at the 5% level)
tsdisplay(diff(sales_m_ts))

Monthly sales

plot(sales_m_ts)

# number of differences needed to make the series stationary
ndiffs(sales_m_ts)
## [1] 1
tsdisplay(diff(sales_m_ts))

Correlogram plot maybe suggests AR-1 or MA-1 after first difference

# ARIMA(p,d,q) = (1,1,0): AR(1) on the once-differenced series
arima1_m<- Arima(sales_m_ts, order=c(1,1,0))
summary(arima1_m)
## Series: sales_m_ts 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.3178
## s.e.   0.1783
## 
## sigma^2 = 2.666e+14:  log likelihood = -630.5
## AIC=1265   AICc=1265.38   BIC=1268.11
## 
## Training set error measures:
##                   ME     RMSE      MAE      MPE     MAPE      MASE        ACF1
## Training set 4664178 15867282 12399192 6.652001 14.52265 0.3786368 -0.07165611
# study residuals to see if it is a good model
resid1_m<- residuals(arima1_m)
tsdisplay(resid1_m)

Residuals look stationary after fitting ARIMA

# Automatic order selection (auto.arima minimizes AICc)
auto_arima_m <- auto.arima(sales_m_ts)
auto_arima_m
## Series: sales_m_ts 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##           ma1    drift
##       -0.3741  3475637
## s.e.   0.1604  1659940
## 
## sigma^2 = 2.494e+14:  log likelihood = -628.83
## AIC=1263.66   AICc=1264.44   BIC=1268.33
# Forecast from the automatically selected model
autoplot(forecast(auto_arima_m))

# Residual diagnostics (ACF + Ljung-Box)
checkresiduals(auto_arima_m)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1) with drift
## Q* = 0.42136, df = 6, p-value = 0.9987
## 
## Model df: 1.   Total lags used: 7

The residuals of the Autoarima look stationary

AIC of the manual ARIMA is 1265, while that of the auto-ARIMA is 1263.66. Let's use the auto-ARIMA.

# Fitted values from both models
fitted_auto_arima <- fitted(auto_arima_m)
fitted_arima1 <- fitted(arima1_m)

# Create a data frame for plotting (actual + both fitted series)
plot_data <- data.frame(
  Time = time(sales_m_ts),
  Actual = as.numeric(sales_m_ts),
  Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima),
  Fitted_ARIMA1 = as.numeric(fitted_arima1)
)

# Melt the data frame (long format for ggplot)
plot_data_melted <- melt(plot_data, id.vars = "Time", 
                         variable.name = "Series", 
                         value.name = "Value")

# Plot

# Actual (points) vs fitted (lines) for both ARIMA models
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
  geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) +  # Actual values as points
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0)
  geom_line(data = subset(plot_data_melted, Series != "Actual"), linewidth = 1) +   # Fitted values as lines
  labs(
    title = "Actual vs Fitted Values for ARIMA Models",
    x = "Time",
    y = "Sales",
    color = "Series"
  ) +
  scale_color_manual(
    values = c("Actual" = "black", "Fitted_Auto_ARIMA" = "blue", "Fitted_ARIMA1" = "red"),
    labels = c("Actual", "Fitted (Auto ARIMA)", "Fitted (ARIMA(1,1,0))")
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.title = element_blank()
  )

# Calculate RMSE for the fitted values
# Calculate RMSE for each model (in-sample, one-step-ahead)
rmse_auto_arima <- calculate_rmse(observed = sales_m_ts, predicted = fitted_auto_arima)
rmse_arima1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_arima1)

# Print RMSE values
cat("RMSE for Auto ARIMA Model:", rmse_auto_arima, "\n")
## RMSE for Auto ARIMA Model: 15118942
cat("RMSE for ARIMA(1,1,0) Model:", rmse_arima1, "\n")
## RMSE for ARIMA(1,1,0) Model: 15867282

The RMSE of the Autoarima is better as is the AIC.

The ARIMA(0,1,1) model can be described simply as a random walk with drift. Here’s what that means:

  1. AR (AutoRegressive) Part:

    • The first number, 0, indicates the order of the autoregressive part. In this case, it means there are no autoregressive terms (i.e., the model does not use past values of the series to predict future values).
  2. I (Integrated) Part:

    • The second number, 1, indicates the degree of differencing required to make the time series stationary. Differencing is a technique used to remove trends and seasonality from the series. A value of 1 means the series is differenced once.
  3. MA (Moving Average) Part:

    • The third number, 1, indicates the order of the moving average part.

An ARIMA(0,1,1) model is suitable when:

The d=1 parameter in ARIMA(0,1,1) indicates that the series is differenced once to achieve stationarity. Before differencing, the series may exhibit a linear trend or random walk behavior. After differencing, the series should show no trend and have relatively stable mean and variance

The q=1 in ARIMA(0,1,1) indicates that the series is modeled with a first-order moving average component after differencing. The autocorrelation function (ACF) of the differenced series should show: A significant spike at lag 1. Rapid decay to zero after lag 1. The partial autocorrelation function (PACF) should show no significant lags.

# study residuals to see if it is a good model
resid_autoarima_m<- residuals(auto_arima_m)
tsdisplay(resid_autoarima_m)

Weekly sales

# see if series is stationary (ADF on the raw weekly series)
adf.test(sales_w_ts) #H0, series is non-stationary
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sales_w_ts
## Dickey-Fuller = -2.7572, Lag order = 5, p-value = 0.2607
## alternative hypothesis: stationary
# p-val > 0.05 => dont reject, non stationary: series is not stationary
adf.test(diff(sales_w_ts)) # after diff it is stationary (ADF rejects H0)
## Warning in adf.test(diff(sales_w_ts)): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(sales_w_ts)
## Dickey-Fuller = -6.5801, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary

After differencing, looks stationary

tsdisplay(diff(sales_w_ts))  # ACF/PACF of the differenced weekly series

Correlograms suggest maybe AR 1 or MA 1.

### Manual ARIMA------------
# ARIMA(p,d,q) = (1,1,0): AR(1) on the once-differenced weekly series
arima1_w<- Arima(sales_w_ts, order=c(1,1,0))
summary(arima1_w)
## Series: sales_w_ts 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.4393
## s.e.   0.0742
## 
## sigma^2 = 1.161e+13:  log likelihood = -2534.49
## AIC=5072.98   AICc=5073.06   BIC=5079.06
## 
## Training set error measures:
##                    ME    RMSE     MAE        MPE     MAPE      MASE        ACF1
## Training set 193180.2 3385012 2537492 -0.1816435 13.03938 0.3404022 -0.04968945
# Automatic order selection for the weekly series
auto_arima_w <- auto.arima(sales_w_ts)
summary(auto_arima_w)
## Series: sales_w_ts 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.5426
## s.e.   0.0758
## 
## sigma^2 = 1.122e+13:  log likelihood = -2531.96
## AIC=5067.91   AICc=5067.99   BIC=5073.99
## 
## Training set error measures:
##                    ME    RMSE     MAE       MPE     MAPE      MASE        ACF1
## Training set 335675.7 3328293 2462721 0.6620165 12.38346 0.3303717 -0.01238358

AIC on the Autoarima is better, lets go with that one

checkresiduals(auto_arima_w)  # residual diagnostics (ACF + Ljung-Box)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 17.171, df = 30, p-value = 0.9705
## 
## Model df: 1.   Total lags used: 31

Residuals look stationary, see the plots for both models

# ARIMA models for weekly data
# NOTE: arima1_w and auto_arima_w were already fitted above with these exact
# specifications, so we reuse those objects instead of refitting them
# (both fits are deterministic, so the results are identical).

# Extract fitted values for both models
fitted_arima1_w <- fitted(arima1_w)
fitted_auto_arima_w <- fitted(auto_arima_w)

# Create a data frame for plotting (actual + both fitted series)
plot_data <- data.frame(
  Time = time(sales_w_ts),
  Actual = as.numeric(sales_w_ts),
  Fitted_ARIMA1 = as.numeric(fitted_arima1_w),
  Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima_w)
)

# Melt the data frame for ggplot2 (long format)
plot_data_melted <- melt(plot_data, id.vars = "Time", 
                         variable.name = "Series", 
                         value.name = "Value")

# Plot using ggplot2
# Actual (points) vs fitted (lines) for both weekly ARIMA models
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
  geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) +  # Actual values as points
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0)
  geom_line(data = subset(plot_data_melted, Series != "Actual"), linewidth = 1) +   # Fitted values as lines
  labs(
    title = "Actual vs Fitted Values for ARIMA Models (Weekly)",
    x = "Time",
    y = "Sales",
    color = "Series"
  ) +
  scale_color_manual(
    values = c("Actual" = "black", "Fitted_ARIMA1" = "red", "Fitted_Auto_ARIMA" = "blue"),
    labels = c("Actual", "Fitted (ARIMA(1,1,0))", "Fitted (Auto ARIMA)")
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.title = element_blank()
  )

# Calculate RMSE for both models (in-sample, one-step-ahead)
rmse_arima1_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_arima1_w)
rmse_auto_arima_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_auto_arima_w)

# Print RMSE values
cat("RMSE for ARIMA(1,1,0) Model (Weekly):", rmse_arima1_w, "\n")
## RMSE for ARIMA(1,1,0) Model (Weekly): 3385012
cat("RMSE for Auto ARIMA Model (Weekly):", rmse_auto_arima_w, "\n")
## RMSE for Auto ARIMA Model (Weekly): 3328293

The Auto-arima is also better in terms of RMSE

Daily sales

# see if series is stationary (ADF on the raw daily series)
adf.test(sales_d_ts) #H0, series is non-stationary
## Warning in adf.test(sales_d_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sales_d_ts
## Dickey-Fuller = -5.6283, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# p-val < 0.05 =>  reject non stationary: series might be stationary

No need for differencing because the series is already stationary; try to model it with ARIMA.

tsdisplay(sales_d_ts)  # ACF/PACF of the raw daily series

Autocorrelograms are not easy to interpret, but lets try with a baseline model

# ARIMA(p,d,q) = (1,0,1)  -- the previous comment said (2,1,0), which did
# not match the model actually fitted below
arima1_d<- Arima(sales_d_ts, order=c(1,0,1))
summary(arima1_d)
## Series: sales_d_ts 
## ARIMA(1,0,1) with non-zero mean 
## 
## Coefficients:
##          ar1     ma1        mean
##       0.3253  0.3666  3320965.47
## s.e.  0.0434  0.0400    92676.64
## 
## sigma^2 = 2.184e+12:  log likelihood = -16248.67
## AIC=32505.35   AICc=32505.39   BIC=32525.14
## 
## Training set error measures:
##                    ME    RMSE     MAE      MPE     MAPE      MASE      ACF1
## Training set 545.5599 1475571 1180971 -110.111 132.9826 0.5165637 0.0150054
checkresiduals(arima1_d)  # residual diagnostics (ACF + Ljung-Box)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1) with non-zero mean
## Q* = 2743.8, df = 206, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 208

Residuals do not look entirely stationary.

Try to model with automatic approach:

# Automatic order selection on the daily series
auto_arima_d <- auto.arima(sales_d_ts)
summary(auto_arima_d)
## Series: sales_d_ts 
## ARIMA(5,1,5) with drift 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4     ar5      ma1     ma2      ma3
##       0.9544  -1.5459  1.0021  -1.0850  0.1718  -1.6378  1.8605  -1.7958
## s.e.  0.0587   0.0493  0.0822   0.0449  0.0505   0.0495  0.0774   0.0845
##          ma4      ma5     drift
##       1.3246  -0.6504  3652.866
## s.e.  0.0733   0.0363  2319.631
## 
## sigma^2 = 1.213e+12:  log likelihood = -15926.04
## AIC=31876.08   AICc=31876.38   BIC=31935.43
## 
## Training set error measures:
##                     ME    RMSE      MAE       MPE     MAPE      MASE
## Training set -1608.767 1094980 810378.5 -49.07305 67.93344 0.3544643
##                     ACF1
## Training set 0.003023956
checkresiduals(auto_arima_d)  # residual diagnostics (ACF + Ljung-Box)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,1,5) with drift
## Q* = 189.3, df = 198, p-value = 0.6592
## 
## Model df: 10.   Total lags used: 208

Residuals improve, and AIC is lower in the auto-ARIMA. Check the fit for both models.

# Extract fitted values for both models
fitted_arima1_d <- fitted(arima1_d)
fitted_auto_arima_d <- fitted(auto_arima_d)

# Create a data frame for plotting (actual + both fitted series)
plot_data <- data.frame(
  Time = time(sales_d_ts),
  Actual = as.numeric(sales_d_ts),
  Fitted_ARIMA1 = as.numeric(fitted_arima1_d),
  Fitted_Auto_ARIMA = as.numeric(fitted_auto_arima_d)
)

# Melt the data frame for ggplot2 (long format)
plot_data_melted <- melt(plot_data, id.vars = "Time", 
                         variable.name = "Series", 
                         value.name = "Value")

# Plot using ggplot2
# Actual (points) vs fitted (lines) for both daily ARIMA models
ggplot(plot_data_melted, aes(x = Time, y = Value, color = Series)) +
  geom_point(data = subset(plot_data_melted, Series == "Actual"), size = 2) +  # Actual values as points
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0)
  geom_line(data = subset(plot_data_melted, Series != "Actual"), linewidth = 1) +   # Fitted values as lines
  labs(
    title = "Actual vs Fitted Values for ARIMA Models (Daily)",
    x = "Time",
    y = "Sales",
    color = "Series"
  ) +
  scale_color_manual(
    values = c("Actual" = "black", "Fitted_ARIMA1" = "red", "Fitted_Auto_ARIMA" = "blue"),
    labels = c("Actual", "Fitted (ARIMA(1,0,1))", "Fitted (Auto ARIMA)")
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.title = element_blank()
  )

Plot is not readable, but check the RMSE for both models to confirm which fits better.

# Calculate RMSE for both models (in-sample, one-step-ahead)
rmse_arima1_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_arima1_d)
rmse_auto_arima_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_auto_arima_d)

# Print RMSE values
cat("RMSE for ARIMA(1,0,1) Model (Daily):", rmse_arima1_d, "\n")
## RMSE for ARIMA(1,0,1) Model (Daily): 1475571
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
## RMSE for Auto ARIMA Model (Daily): 1094980

The auto-ARIMA is much better; now try to improve with seasonality, because daily data looks seasonal every 7 days.

SARIMA

# Daily sales
tsdisplay(sales_d_ts) # raw daily series

tsdisplay(diff(sales_d_ts))

# NOTE(review): the output below is identical to the earlier non-seasonal
# auto.arima fit, i.e. no seasonal terms were selected for this series
sarima_d <- auto.arima(sales_d_ts, seasonal=TRUE)
summary(sarima_d)
## Series: sales_d_ts 
## ARIMA(5,1,5) with drift 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4     ar5      ma1     ma2      ma3
##       0.9544  -1.5459  1.0021  -1.0850  0.1718  -1.6378  1.8605  -1.7958
## s.e.  0.0587   0.0493  0.0822   0.0449  0.0505   0.0495  0.0774   0.0845
##          ma4      ma5     drift
##       1.3246  -0.6504  3652.866
## s.e.  0.0733   0.0363  2319.631
## 
## sigma^2 = 1.213e+12:  log likelihood = -15926.04
## AIC=31876.08   AICc=31876.38   BIC=31935.43
## 
## Training set error measures:
##                     ME    RMSE      MAE       MPE     MAPE      MASE
## Training set -1608.767 1094980 810378.5 -49.07305 67.93344 0.3544643
##                     ACF1
## Training set 0.003023956
# residual diagnostics for the seasonal fit
resid_ds<- residuals(sarima_d)
tsdisplay(resid_ds)

# check for autocorrelation (Ljung-Box, H0: residuals independent)
Box.test(residuals(sarima_d), type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(sarima_d)
## X-squared = 0.0095375, df = 1, p-value = 0.9222
# A low p-value (<0.05) suggests residual autocorrelation.

Looks like the auto-ARIMA is the same in terms of AIC; let's check the RMSE:

# Extract fitted values for both models

fitted_sarima_d <- fitted(sarima_d)

# Calculate RMSE for both models (in-sample)
rmse_sarima_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_sarima_d)

# Print RMSE values
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
## RMSE for Auto ARIMA Model (Daily): 1094980
cat("RMSE for Seasonal ARIMA Model (Daily):", rmse_sarima_d, "\n")
## RMSE for Seasonal ARIMA Model (Daily): 1094980

The RMSE is exactly the same, they are the same model.

SARIMAX

Refine SARIMA with external regressors

# redefine sales_d_ts (back-transform sales from the log scale)
head(df_merged_d)
sales_d_ts <- ts(exp(df_merged_d$sales_cop), frequency=365, start=c(2021, 334))  # 334 is November 30
seasonal_sales_d_ts <- ts(exp(df_merged_d$sales_cop), frequency=7, start=c(2021, 334))  # 334 is November 30
plot(sales_d_ts)

tsdisplay(sales_d_ts,lag.max = 30)

tsdisplay(seasonal_sales_d_ts,lag.max = 30)

# define regressors
# Select specific columns by name
x_regressors_d <- df_merged_d %>% select(rain_sum, fx, tmedian)
# Back-transform the log-scaled regressors: exp() is vectorized and works
# column-wise on a data frame via the Math group generic, so the previous
# apply(..., 2, exp) / as.data.frame round-trip is unnecessary.
# Convert to a matrix, as required by auto.arima()'s xreg argument.
x_regressors_d <- as.matrix(exp(x_regressors_d))
# fit the model on sales
# Fit an auto.arima model with seasonal component and external regressors
sarimax_model_d <- auto.arima(
  sales_d_ts,
  seasonal = TRUE,               # Enable seasonal components
  xreg = x_regressors_d          # External regressors
)

# Display the summary of the fitted model (coefficients, AIC, accuracy)
summary(sarimax_model_d)
## Series: sales_d_ts 
## Regression with ARIMA(5,1,5) errors 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4     ar5      ma1     ma2      ma3
##       0.9230  -1.5215  0.9550  -1.0569  0.1394  -1.6554  1.8823  -1.8168
## s.e.  0.0605   0.0502  0.0831   0.0448  0.0509   0.0516  0.0805   0.0836
##          ma4      ma5     drift  rain_sum         fx  tmedian
##       1.3447  -0.6535  3880.584   38.8494  -375.8824  62715.8
## s.e.  0.0730   0.0373  2346.036  112.5507   383.7332  30430.2
## 
## sigma^2 = 1.333e+12:  log likelihood = -15973.62
## AIC=31977.25   AICc=31977.72   BIC=32051.44
## 
## Training set error measures:
##                     ME    RMSE    MAE      MPE    MAPE      MASE        ACF1
## Training set -2264.148 1146332 833227 -3414423 3414442 0.3598772 0.000552024

The AIC actually increases (31977.25 vs 31876.08 for the plain auto-ARIMA), let's check the RMSE

# Extract fitted values for all models
fitted_sarimax_d <- fitted(sarimax_model_d)

# Calculate RMSE for all models (in-sample)
rmse_sarimax_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_sarimax_d)

# Print RMSE values
cat("RMSE for Auto ARIMA Model (Daily):", rmse_auto_arima_d, "\n")
## RMSE for Auto ARIMA Model (Daily): 1094980
cat("RMSE for Seasonal ARIMA Model (Daily):", rmse_sarima_d, "\n")
## RMSE for Seasonal ARIMA Model (Daily): 1094980
cat("RMSE for SARIMAX Model (Daily):", rmse_sarimax_d, "\n")
## RMSE for SARIMAX Model (Daily): 1146332

The RMSE also worsens, so stay with regular Auto-ARIMA

Exponential Smoothing methods

Simple Exponential Smoothing

Monthly Sales

Monthly Sales

# SES with fixed alpha = 0.2, fixed alpha = 0.6, and optimized alpha (h = 5)
fit_m1 <- ses(sales_m_ts, alpha = 0.2, initial = 'simple', h=5)
fit_m2 <- ses(sales_m_ts, alpha = 0.6, initial = 'simple', h=5)
fit_m3 <- ses(sales_m_ts, h=5)

plot(sales_m_ts, ylab='Monthly Sales', xlab='Months')
lines(fitted(fit_m1), col='blue', type='o')   # alpha = 0.2
lines(fitted(fit_m2), col='red', type='o')    # alpha = 0.6
lines(fitted(fit_m3), col='green', type='o')  # optimized alpha

# SES with optimized alpha, used for the forecast summary below
forecast_m1 <- ses(sales_m_ts, h=5)

# Accuracy of one-step-ahead training errors
round(accuracy(forecast_m1),2)
##                   ME     RMSE      MAE   MPE MAPE MASE  ACF1
## Training set 3207582 17626876 13250973 -12.1 29.7  0.4 -0.05
summary(forecast_m1)  # model info, error measures and 5-step forecasts
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
## ses(y = sales_m_ts, h = 5)
## 
##   Smoothing parameters:
##     alpha = 0.675 
## 
##   Initial states:
##     l = 56978123.3871 
## 
##   sigma:  18137906
## 
##      AIC     AICc      BIC 
## 1336.322 1337.072 1341.073 
## 
## Error measures:
##                   ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 3207582 17626876 13250973 -12.09656 29.70395 0.4046478 -0.0504105
## 
## Forecasts:
##          Point Forecast     Lo 80     Hi 80    Lo 95     Hi 95
## Nov 2024      134921433 111676771 158166094 99371791 170471075
## Dec 2024      134921433 106877020 162965845 92031205 177811660
## Jan 2025      134921433 102786345 167056520 85775057 184067808
## Feb 2025      134921433  99160579 170682286 80229927 189612939
## Mar 2025      134921433  95870012 173972853 75197439 194645426
# Forecast plot with fitted values overlaid
autoplot(forecast_m1) + autolayer(fitted(forecast_m1),series='Fitted') + ylab("Monthly Sales")+xlab("Months")

# Extract fitted values for each model
fitted_m1 <- fitted(fit_m1)
fitted_m2 <- fitted(fit_m2)
fitted_m3 <- fitted(fit_m3)

# Calculate RMSE for each model (in-sample)
rmse_m1 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m1)
rmse_m2 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m2)
rmse_m3 <- calculate_rmse(observed = sales_m_ts, predicted = fitted_m3)

# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_m1, "\n")
## RMSE for SES Model 1 (alpha = 0.2): 23924620
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_m2, "\n")
## RMSE for SES Model 2 (alpha = 0.6): 16189623
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_m3, "\n")
## RMSE for SES Model 3 (Optimized alpha): 17626876
# keep the best monthly SES (alpha = 0.6) for the final model comparison
rmse_exp_sm_m <- rmse_m2
Weekly Sales

Weekly Sales

For weekly data, exponential smoothing can capture longer-term trends and seasonal patterns that repeat on a weekly basis. Weekly data can also have seasonal components related to months, quarters, or years.

# SES with fixed alpha = 0.2, fixed alpha = 0.6, and optimized alpha (h = 5)
fit_w1 <- ses(sales_w_ts, alpha = 0.2, initial = 'simple', h=5)
fit_w2 <- ses(sales_w_ts, alpha = 0.6, initial = 'simple', h=5)
fit_w3 <- ses(sales_w_ts, h=5)

plot(sales_w_ts, ylab='Weekly Sales', xlab='Weeks')
lines(fitted(fit_w1), col='blue', type='o')   # alpha = 0.2
lines(fitted(fit_w2), col='red', type='o')    # alpha = 0.6
lines(fitted(fit_w3), col='green', type='o')  # optimized alpha

# SES with optimized alpha; one-step-ahead training accuracy
forecast_w1 <- ses(sales_w_ts, h=5)
round(accuracy(forecast_w1),2)
##                    ME    RMSE     MAE   MPE  MAPE MASE  ACF1
## Training set 306073.8 3331847 2478469 -0.46 13.08 0.33 -0.01
summary(forecast_w1)  # model info, error measures and 5-step forecasts
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
## ses(y = sales_w_ts, h = 5)
## 
##   Smoothing parameters:
##     alpha = 0.4502 
## 
##   Initial states:
##     l = 5339607.7196 
## 
##   sigma:  3353553
## 
##      AIC     AICc      BIC 
## 5443.633 5443.791 5452.763 
## 
## Error measures:
##                    ME    RMSE     MAE        MPE     MAPE      MASE
## Training set 306073.8 3331847 2478469 -0.4636447 13.08087 0.3324843
##                      ACF1
## Training set -0.006458446
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2024.865       26697597 22399845 30995348 20124753 33270441
## 2024.885       26697597 21984397 31410797 19489379 33905814
## 2024.904       26697597 21602713 31792480 18905645 34489549
## 2024.923       26697597 21247696 32147498 18362692 35032501
## 2024.942       26697597 20914431 32480762 17853008 35542186
# Forecast plot with fitted values overlaid
autoplot(forecast_w1) + autolayer(fitted(forecast_w1),series='Fitted') + ylab("Weekly Sales")+xlab("Weeks")

# Extract fitted values for each model
fitted_w1 <- fitted(fit_w1)
fitted_w2 <- fitted(fit_w2)
fitted_w3 <- fitted(fit_w3)

# Calculate RMSE for each model (in-sample)
rmse_w1 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w1)
rmse_w2 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w2)
rmse_w3 <- calculate_rmse(observed = sales_w_ts, predicted = fitted_w3)

# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_w1, "\n")
## RMSE for SES Model 1 (alpha = 0.2): 3564863
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_w2, "\n")
## RMSE for SES Model 2 (alpha = 0.6): 3363852
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_w3, "\n")
## RMSE for SES Model 3 (Optimized alpha): 3331847
# keep the best weekly SES (optimized alpha) for the final model comparison
rmse_exp_sm_w <- rmse_w3
Daily sales

Daily Sales

For daily data, exponential smoothing can be used to forecast short-term trends and seasonal patterns. When applying exponential smoothing to daily data, you need to consider:

  • Seasonality: Daily data often exhibit seasonal patterns, such as weekly cycles (e.g., higher sales on weekends).

  • Holidays and special events: These can cause irregular patterns in daily data that may need to be accounted for.

# SES with fixed alpha = 0.2, fixed alpha = 0.6, and optimized alpha (h = 5)
fit_d1 <- ses(sales_d_ts, alpha = 0.2, initial = 'simple', h=5)
fit_d2 <- ses(sales_d_ts, alpha = 0.6, initial = 'simple', h=5)
fit_d3 <- ses(sales_d_ts, h=5)

plot(sales_d_ts, ylab='Daily Sales', xlab='Days')
lines(fitted(fit_d1), col='blue', type='o')   # alpha = 0.2
lines(fitted(fit_d2), col='red', type='o')    # alpha = 0.6
lines(fitted(fit_d3), col='green', type='o')  # optimized alpha

# SES with optimized alpha; one-step-ahead training accuracy
forecast_d1 <- ses(sales_d_ts, h=5)
round(accuracy(forecast_d1),2)
##                    ME    RMSE     MAE      MPE    MAPE MASE ACF1
## Training set 91166.44 1642520 1279475 -3521871 3521898 0.55 0.38
summary(forecast_d1)  # model info, error measures and 5-step forecasts
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
## ses(y = sales_d_ts, h = 5)
## 
##   Smoothing parameters:
##     alpha = 0.0372 
## 
##   Initial states:
##     l = 863041.8751 
## 
##   sigma:  1644102
## 
##      AIC     AICc      BIC 
## 36999.28 36999.30 37014.12 
## 
## Error measures:
##                    ME    RMSE     MAE      MPE    MAPE      MASE      ACF1
## Training set 91166.44 1642520 1279475 -3521871 3521898 0.5526151 0.3755302
## 
## Forecasts:
##           Point Forecast   Lo 80   Hi 80   Lo 95   Hi 95
## 2024.7616        4388896 2281895 6495897 1166516 7611277
## 2024.7644        4388896 2280439 6497354 1164288 7613504
## 2024.7671        4388896 2278983 6498809 1162063 7615730
## 2024.7699        4388896 2277529 6500264 1159838 7617954
## 2024.7726        4388896 2276075 6501717 1157616 7620177
# Forecast plot with fitted values overlaid
autoplot(forecast_d1) + autolayer(fitted(forecast_d1),series='Fitted') + ylab("Daily Sales")+xlab("Days")

# Extract fitted values for each model
fitted_d1 <- fitted(fit_d1)
fitted_d2 <- fitted(fit_d2)
fitted_d3 <- fitted(fit_d3)

# Calculate RMSE for each model (in-sample)
rmse_d1 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d1)
rmse_d2 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d2)
rmse_d3 <- calculate_rmse(observed = sales_d_ts, predicted = fitted_d3)

# Print RMSE values
cat("RMSE for SES Model 1 (alpha = 0.2):", rmse_d1, "\n")
## RMSE for SES Model 1 (alpha = 0.2): 1714231
cat("RMSE for SES Model 2 (alpha = 0.6):", rmse_d2, "\n")
## RMSE for SES Model 2 (alpha = 0.6): 1803005
cat("RMSE for SES Model 3 (Optimized alpha):", rmse_d3, "\n")
## RMSE for SES Model 3 (Optimized alpha): 1642520
# keep the best daily SES (optimized alpha) for the final model comparison
rmse_exp_sm_d <- rmse_d3

GGM + SARMAX

Monthly

# 1.  GGM on sales_m_ts
# (ggm1 was fitted earlier in this document on the monthly series)


summary(ggm1)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_m_ts, mt = "base", display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -27544719  -8809035    742251   -337148   7701034  23238738 
## 
## Coefficients:
##           Estimate    Std.Error        Lower        Upper  p-value    
## K    9.208740e+09 2.454321e+09 4.398360e+09 1.401912e+10 7.24e-04 ***
## pc   2.617639e-02 3.002777e-03 2.029106e-02 3.206173e-02 7.64e-10 ***
## qc   2.453426e-01 3.079890e-02 1.849779e-01 3.057074e-01 5.41e-09 ***
## ps   6.782088e-03 1.329165e-03 4.176972e-03 9.387204e-03 1.60e-05 ***
## qs   3.849713e-02 8.943356e-03 2.096848e-02 5.602579e-02 1.56e-04 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  13318300  on  31  degrees of freedom
##  Multiple R-squared:   0.999873  Residual sum of squares:  5.498691e+15
# 2. Get the instantaneous fitted values of the GGM

fitted_ggm_cum <- fitted(ggm1)
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum)


# 3. Auto-SARIMAX on cumsum(sales_m_ts), with xreg = instantaneous GGM

# First, create the cumulative sales
cumulative_sales_m_ts <- cumsum(sales_m_ts)


sarima_m <- auto.arima(
  cumulative_sales_m_ts,
  xreg         = fitted_ggm_instant,
  seasonal     = TRUE,
  stepwise     = FALSE,
  approximation= FALSE
)


# 4. Extract the fitted (cumulative) values of the SARIMAX

fitted_cum_sarima <- fitted(sarima_m)



# 5. Convert SARIMAX fitted cumulative back to instantaneous
# One simple approach:
fitted_instant_sarima <- diff(c(0, fitted_cum_sarima))
# Now fitted_instant_sarima should have the same length as sales_m_ts.

# Turn it into a time series object for neat plotting
fitted_instant_sarima_ts <- ts(
  fitted_instant_sarima,
  start = start(sales_m_ts),
  frequency = frequency(sales_m_ts)
)

# 6. Plot the actual monthly sales and the SARIMAX instantaneous fit

plot(
  sales_m_ts,
  type="o",
  col="black",
  lwd=2,
  main="Actual Monthly Sales vs. SARIMAX Fitted (Instantaneous)",
  xlab="Time",
  ylab="Monthly Sales"
)

lines(
  fitted_instant_sarima_ts,
  col="red",
  lty=2,
  lwd=2
)

legend(
  "bottomright",
  legend = c("Actual Sales", "SARIMAX Fit (Instantaneous)"),
  col    = c("black", "red"),
  lty    = c(1, 2),
  lwd    = c(2, 2)
)

# Calculate RMSE for fitted_instantaneous_ts against sales_m_ts
rmse_mixture_m <- calculate_rmse(observed = sales_m_ts, predicted = fitted_instant_sarima_ts)

# Print the RMSE value
cat("RMSE for Fitted Instantaneous Values (GGM + SARIMAX):", rmse_mixture_m, "\n")
## RMSE for Fitted Instantaneous Values (GGM + SARIMAX): 20987648
resid_mixture_m <- sales_m_ts - fitted_instant_sarima_ts
tsdisplay(resid_mixture_m)

Residuals have autocorrelation at lag 1

Weekly

# GGM + SARIMAX hybrid (weekly): same recipe as the monthly section —
# GGM for the diffusion trend, auto.arima on the cumulative series with the
# GGM instantaneous fit as xreg for the remaining structure.
# NOTE(review): ggm1_w is assumed to be fitted earlier in the file
# (GGM(series = sales_w_ts, ...) per the Call below).

# 1. GGM on sales_w_ts

summary(ggm1_w)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_w_ts, mt = "base", display = T)
## 
## Residuals:
##       Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -30941134  -8439496   1818273     88704   8469414  21113528 
## 
## Coefficients:
##           Estimate    Std.Error        Lower        Upper  p-value    
## K    6.454454e+09 5.261291e+08 5.423260e+09 7.485648e+09 2.13e-24 ***
## pc   3.542302e-04 2.760784e-05 3.001199e-04 4.083406e-04 6.66e-26 ***
## qc   2.078354e-02 1.326387e-03 1.818387e-02 2.338321e-02 2.16e-33 ***
## ps   1.149074e-02 3.238499e-04 1.085601e-02 1.212547e-02 7.53e-75 ***
## qs   3.110864e-02 2.164583e-03 2.686613e-02 3.535114e-02 5.35e-30 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  12271825  on  150  degrees of freedom
##  Multiple R-squared:   0.999876  Residual sum of squares:  2.258965e+16
# 2. Get the instantaneous fitted values of the GGM

fitted_ggm_cum <- fitted(ggm1_w)                # Cumulative fit from GGM
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum)  # Convert to weekly flow


# 3. Auto-SARIMAX on cumsum(sales_w_ts), with xreg = instantaneous GGM

# Create the cumulative weekly series
cumulative_sales_w_ts <- cumsum(sales_w_ts)

# Fit an auto.arima with the GGM instantaneous series as xreg.
# Exhaustive exact search (stepwise/approximation disabled).
sarima_w <- auto.arima(
  cumulative_sales_w_ts,
  xreg         = fitted_ggm_instant,
  seasonal     = TRUE,
  stepwise     = FALSE,
  approximation= FALSE
)

# 4. Extract the fitted (cumulative) values of the SARIMAX

fitted_cum_sarima <- fitted(sarima_w)
# This is the model's in-sample one-step-ahead *cumulative* fit.

# 5. Convert SARIMAX fitted cumulative back to instantaneous
# (prepend 0 and difference; first value equals the first cumulative fit,
# which overwrites the monthly section's variable of the same name)

fitted_instant_sarima <- diff(c(0, fitted_cum_sarima))

# Turn it into a time series object
fitted_instant_sarima_ts <- ts(
  fitted_instant_sarima,
  start     = start(sales_w_ts),
  frequency = frequency(sales_w_ts) 
)

# 6. Plot the actual weekly sales and the SARIMAX instantaneous fit
plot(
  sales_w_ts,
  type = "o",
  col  = "black",
  lwd  = 2,
  main = "Actual Weekly Sales vs. SARIMAX Fitted (Instantaneous)",
  xlab = "Time (Weeks)",
  ylab = "Weekly Sales"
)

lines(
  fitted_instant_sarima_ts,
  col = "red",
  lty = 2,
  lwd = 2
)

legend(
  "bottomright",
  legend = c("Actual Weekly Sales", "SARIMAX Fit (Instantaneous)"),
  col    = c("black", "red"),
  lty    = c(1, 2),
  lwd    = c(2, 2)
)

# Residuals
# Step 1: Extract residuals from the SARIMA model
resid_w <- residuals(sarima_w)

# Step 2: Visualize residuals
# Joint plot of the residual series plus its ACF/PACF
tsdisplay(resid_w, main = "Residual Diagnostics for SARIMA Model")

# Step 3: Test residuals for stationarity (ADF; H0 = unit root)
adf_test <- adf.test(resid_w)
## Warning in adf.test(resid_w): p-value smaller than printed p-value
cat("ADF Test p-value:", adf_test$p.value, "\n")
## ADF Test p-value: 0.01
if (adf_test$p.value < 0.05) {
  cat("The residuals are stationary.\n")
} else {
  cat("The residuals are not stationary.\n")
}
## The residuals are stationary.
# Step 4: Test residuals for white noise (Ljung-Box; H0 = no autocorrelation)

ljung_box_test <- Box.test(resid_w, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
## Ljung-Box Test p-value: 0.8993948
if (ljung_box_test$p.value > 0.05) {
  cat("The residuals resemble white noise (uncorrelated).\n")
} else {
  cat("The residuals show significant autocorrelation.\n")
}
## The residuals resemble white noise (uncorrelated).

Stationary residuals that resemble white noise: the Ljung-Box test above (p = 0.90) shows no significant autocorrelation

#### RMSE for SARIMAX Predictions ####
# In-sample RMSE of the weekly GGM + SARIMAX hybrid
# (fitted_instant_sarima_ts was built in the section above).
rmse_mixture_w <- calculate_rmse(observed = sales_w_ts, predicted = fitted_instant_sarima_ts)

# Print RMSE for SARIMAX
cat("RMSE for SARIMAX Predictions:", rmse_mixture_w, "\n")
## RMSE for SARIMAX Predictions: 4567633

Daily

# GGM + SARIMAX hybrid (daily). Same recipe as above, but the inputs are
# standardized with scale() so the optimizer converges, and the fitted
# cumulative values are transformed back to the original units afterwards.
# NOTE(review): per the Call in the summary, ggm1_d was fitted on a series
# named sales_scaled (not sales_d_ts directly) — confirm the upstream chunk.

# 1. GGM on sales_d_ts

summary(ggm1_d)
## Call: ( Guseo Guidolin Model )
## 
##   GGM(series = sales_scaled, mt = "base", display = T)
## 
## Residuals:
##      Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.40144 -0.89220  0.18363  0.04369  0.98854  2.48411 
## 
## Coefficients:
##           Estimate    Std.Error         Lower        Upper   p-value    
## K    1.174310e+03 4.487889e+02 294.699957932 2.053920e+03  9.01e-03  **
## pc   2.384956e-05 1.641538e-05  -0.000008324 5.602311e-05  1.47e-01    
## qc   2.113894e-03 1.971344e-04   0.001727518 2.500270e-03  1.64e-25 ***
## ps   1.722612e-03 5.791737e-05   0.001609096 1.836128e-03 5.37e-141 ***
## qs   2.948140e-03 1.218250e-04   0.002709367 3.186912e-03 6.84e-103 ***
## ---
##  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Residual standard error  1.202014  on  1035  degrees of freedom
##  Multiple R-squared:   0.999862  Residual sum of squares:  1495.407
# 2. Get the instantaneous fitted values of the GGM

fitted_ggm_cum <- fitted(ggm1_d)                 # Cumulative fit from GGM
fitted_ggm_instant <- make.instantaneous(fitted_ggm_cum)  # Convert to daily flow

# 3. Auto-SARIMAX on cumsum(sales_d_ts), with xreg = instantaneous GGM

# Create the cumulative daily series
cumulative_sales_d_ts <- cumsum(sales_d_ts)


# first scale data so it can converge

cumulative_sales_d_ts <- scale(cumulative_sales_d_ts)
fitted_ggm_instant <- scale(fitted_ggm_instant)

# Save the scaling attributes so the fitted values can be converted back
# to the original units further down.
cumulative_sales_center <- attr(cumulative_sales_d_ts, "scaled:center")
cumulative_sales_scale <- attr(cumulative_sales_d_ts, "scaled:scale")
ggm_instant_center <- attr(fitted_ggm_instant, "scaled:center")
ggm_instant_scale <- attr(fitted_ggm_instant, "scaled:scale")

# Stepwise/approximate search here (unlike the monthly/weekly sections):
# the daily series is long, so the exact exhaustive search would be slow.
sarima_d <- auto.arima(
  cumulative_sales_d_ts,
  xreg = fitted_ggm_instant,
  stepwise = TRUE,
  approximation = TRUE
)



# 4. Extract the fitted (cumulative) values of the SARIMAX

# Fitted cumulative values (scaled)
fitted_cum_sarima <- fitted(sarima_d)

# Scale back to original cumulative values
fitted_cum_sarima_original <- fitted_cum_sarima * cumulative_sales_scale + cumulative_sales_center

# Convert cumulative fitted to instantaneous fitted (original units).
# Prepending 0 makes the first instantaneous value equal the first cumulative
# fit, which can create a spike at t = 1.
fitted_instant_sarima_original <- diff(c(0, fitted_cum_sarima_original))

# Create time series object for instantaneous fitted values
fitted_instant_sarima_original_ts <- ts(
  fitted_instant_sarima_original,
  start = start(sales_d_ts),
  frequency = frequency(sales_d_ts)
)

# Plot the results
plot(
  sales_d_ts,
  type = "o",
  col  = "black",
  lwd  = 2,
  main = "Actual Daily Sales vs. SARIMAX Fitted (Instantaneous)",
  xlab = "Time (Days)",
  ylab = "Daily Sales"
)

lines(
  fitted_instant_sarima_original_ts,
  col = "red",
  lty = 2,
  lwd = 2
)

legend(
  "bottomright",
  legend = c("Actual Daily Sales", "SARIMAX Fit (Instantaneous)"),
  col    = c("black", "red"),
  lty    = c(1, 2),
  lwd    = c(2, 2)
)

#### Residuals-----------------------
# Extract residuals from the SARIMA model (these are in SCALED units)
resid_d <- residuals(sarima_d)

# Visualize residuals (series + ACF/PACF)
tsdisplay(resid_d, main = "Residual Diagnostics for SARIMA Model")

# Test residuals for stationarity (ADF; H0 = unit root)
adf_test <- adf.test(resid_d)
## Warning in adf.test(resid_d): p-value smaller than printed p-value
cat("ADF Test p-value:", adf_test$p.value, "\n")
## ADF Test p-value: 0.01
if (adf_test$p.value < 0.05) {
  cat("The residuals are stationary.\n")
} else {
  cat("The residuals are not stationary.\n")
}
## The residuals are stationary.
# Test residuals for white noise (Ljung-Box; H0 = no autocorrelation)
ljung_box_test <- Box.test(resid_d, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
## Ljung-Box Test p-value: 0.5910185
if (ljung_box_test$p.value > 0.05) {
  cat("The residuals resemble white noise (uncorrelated).\n")
} else {
  cat("The residuals show significant autocorrelation.\n")
}
## The residuals resemble white noise (uncorrelated).
#### RMSE for SARIMAX Predictions ####
# RMSE computed in ORIGINAL units against the observed daily series.
rmse_mixture_d <- calculate_rmse(observed = sales_d_ts, predicted = fitted_instant_sarima_original_ts)

# Print RMSE for SARIMAX
cat("RMSE for SARIMAX Predictions:", rmse_mixture_d, "\n")
## RMSE for SARIMAX Predictions: 1629664

Prophet

This model was introduced by Facebook (S. J. Taylor & Letham, 2018), originally for forecasting daily data with weekly and yearly seasonality, plus holiday effects. It was later extended to cover more types of seasonal data. It works best with time series that have strong seasonality and several seasons of historical data.

Prophet can be considered a nonlinear regression model (Chapter 7), of the form yt=g(t)+s(t)+h(t)+εt, where g(t) describes a piecewise-linear trend (or “growth term”), s(t) describes the various seasonal patterns, h(t) captures the holiday effects, and εt is a white noise error term.

  • The knots (or changepoints) for the piecewise-linear trend are automatically selected if not explicitly specified. Optionally, a logistic function can be used to set an upper bound on the trend.

  • The seasonal component consists of Fourier terms of the relevant periods. By default, order 10 is used for annual seasonality and order 3 is used for weekly seasonality.

  • Holiday effects are added as simple dummy variables.

  • The model is estimated using a Bayesian approach to allow for automatic selection of the changepoints and other model characteristics.

library(prophet)

The input to Prophet is always a dataframe with two columns: ds and y. The ds (datestamp) column should be in a standard date format, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp. The y column must be numeric, and represents the measurement we wish to forecast.

Monthly

# Prophet on monthly sales: build the (ds, y) dataframe, fit, forecast
# 14 months ahead, and compute the in-sample RMSE.

# sales monthly
ggplot(df_merged_m, aes(x=month, y=sales_m)) +
  geom_line() + ggtitle("Monthly Sales of Restaurant")

head(df_merged_m)
#Prophet model
# model with no seasonality
# Prophet requires exactly two columns named ds (date) and y (numeric).
df_prophet_m <- df_merged_m[1:2]
head(df_prophet_m)
colnames(df_prophet_m) = c("ds", "y")
# NOTE(review): exp() suggests sales_m was log-transformed upstream in
# df_merged_m — confirm; otherwise this back-transform is wrong.
df_prophet_m$y <- exp(df_prophet_m$y)
prophet_sales_m <- prophet(df_prophet_m)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
head(df_prophet_m)
# Step 2: Create a future dataframe for the next 14 months
future_sales_m <- make_future_dataframe(
  prophet_sales_m,
  periods = 14,           # Forecast for 14 months
  freq = 'month',         # Monthly frequency
  include_history = TRUE  # Include historical data in the future dataframe
)
tail(future_sales_m)
forecast_sales_m <- predict(prophet_sales_m, future_sales_m)
tail(forecast_sales_m[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(prophet_sales_m, forecast_sales_m)

# Decompose the forecast into trend and seasonal components.
prophet_plot_components(prophet_sales_m, forecast_sales_m)

# Interactive (dygraphs) version of the forecast plot.
dyplot.prophet(prophet_sales_m, forecast_sales_m)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `select()` instead.
## ℹ The deprecated feature was likely used in the prophet package.
##   Please report the issue at <https://github.com/facebook/prophet/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Use the original dataframe to get the in-sample fitted values
fitted_values <- predict(prophet_sales_m, df_prophet_m)

# Extract the fitted values (column 'yhat' contains the fitted values)
fitted_y <- fitted_values$yhat

# Calculate in-sample RMSE against the actual (back-transformed) sales

actual_y <- df_prophet_m$y  # Actual sales values
rmse_prophet_m <- calculate_rmse(observed = actual_y, predicted = fitted_y)

# Print RMSE
cat("RMSE for Prophet Fitted Values:", rmse_prophet_m, "\n")
## RMSE for Prophet Fitted Values: 16786939

Residuals for prophet

# Residual diagnostics for the monthly Prophet fit.
residuals_prophet <- actual_y - fitted_y

# Joint view of the residual series with its ACF and PACF.
tsdisplay(residuals_prophet, main = "Residual Diagnostics for Prophet Model")

# Augmented Dickey-Fuller test: H0 = unit root (non-stationary residuals).
adf_test <- adf.test(residuals_prophet)
cat("ADF Test p-value:", adf_test$p.value, "\n")
## ADF Test p-value: 0.3278912
adf_msg <- if (adf_test$p.value < 0.05) {
  "Residuals are stationary (reject H0).\n"
} else {
  "Residuals are not stationary (fail to reject H0).\n"
}
cat(adf_msg)
## Residuals are not stationary (fail to reject H0).

# Ljung-Box test: H0 = no autocorrelation up to lag 10.
ljung_box_test <- Box.test(residuals_prophet, lag = 10, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test$p.value, "\n")
## Ljung-Box Test p-value: 6.527107e-10
lb_msg <- if (ljung_box_test$p.value > 0.05) {
  "Residuals resemble white noise (no significant autocorrelation).\n"
} else {
  "Residuals show significant autocorrelation.\n"
}
cat(lb_msg)
## Residuals show significant autocorrelation.

Weekly

# Prophet on weekly sales: inspect the series, then build the (ds, y)
# dataframe and fit the model.
ggplot(df_merged_w, aes(x=week, y=sales_w)) +
  geom_line() + ggtitle("Weekly Sales of Restaurant")

head(df_merged_w)
#Prophet model
# model with no seasonality
df_prophet_w <- df_merged_w[1:2]
colnames(df_prophet_w) = c("ds", "y")
# NOTE(review): as in the monthly section, exp() assumes sales_w was
# log-transformed upstream — confirm.
df_prophet_w$y <- exp(df_prophet_w$y)
df_prophet_w
prophet_sales_w <- prophet(df_prophet_w)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

Predictions are made on a dataframe with a column ds containing the dates for which predictions are to be made. The make_future_dataframe function takes the model object and a number of periods to forecast and produces a suitable dataframe. By default it will also include the historical dates so we can evaluate in-sample fit.

# Horizon of 52 weeks (one year ahead); history kept so the in-sample fit
# can be evaluated from the same forecast frame.
future_sales_w <- make_future_dataframe(prophet_sales_w, 
                                        periods = 52,
                                        freq = 'week',
                                        include_history = T)
tail(future_sales_w)

As with most modeling procedures in R, we use the generic predict function to get our forecast. The forecast object is a dataframe with a column yhat containing the forecast. It has additional columns for uncertainty intervals and seasonal components.

# Forecast, component plots, in-sample RMSE and residual diagnostics for
# the weekly Prophet model.
forecast_sales_w <- predict(prophet_sales_w, future_sales_w)
tail(forecast_sales_w[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(prophet_sales_w, forecast_sales_w)

# Trend / seasonality decomposition of the forecast.
prophet_plot_components(prophet_sales_w, forecast_sales_w)

# Interactive (dygraphs) version of the forecast plot.
dyplot.prophet(prophet_sales_w, forecast_sales_w)
# Use the original dataset to get the in-sample fitted values
fitted_values_w <- predict(prophet_sales_w, df_prophet_w)

# Extract the fitted values (column 'yhat' contains the fitted values)
fitted_y_w <- fitted_values_w$yhat

# Ensure alignment between actual values (y) and fitted values (yhat)
actual_y_w <- df_prophet_w$y  # Actual weekly sales values

# Calculate RMSE for weekly data
rmse_prophet_w <- calculate_rmse(observed = actual_y_w, predicted = fitted_y_w)

# Print RMSE
cat("RMSE for Prophet Fitted Values (Weekly):", rmse_prophet_w, "\n")
## RMSE for Prophet Fitted Values (Weekly): 3246088
# Calculate Residuals
residuals_prophet_w <- actual_y_w - fitted_y_w  # Residuals = Actual - Fitted

# Visualize Residuals (series + ACF/PACF)
tsdisplay(residuals_prophet_w, main = "Residual Diagnostics for Weekly Prophet Model")

# Perform ADF Test for Stationarity (H0 = unit root)

adf_test_w <- adf.test(residuals_prophet_w)
cat("ADF Test p-value:", adf_test_w$p.value, "\n")
## ADF Test p-value: 0.0238759
if (adf_test_w$p.value < 0.05) {
  cat("Residuals are stationary (reject H0).\n")
} else {
  cat("Residuals are not stationary (fail to reject H0).\n")
}
## Residuals are stationary (reject H0).
# Perform Serial Correlation Test (Ljung-Box; H0 = no autocorrelation)
ljung_box_test_w <- Box.test(residuals_prophet_w, lag = 10, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test_w$p.value, "\n")
## Ljung-Box Test p-value: 5.447864e-11
if (ljung_box_test_w$p.value > 0.05) {
  cat("Residuals resemble white noise (no significant autocorrelation).\n")
} else {
  cat("Residuals show significant autocorrelation.\n")
}
## Residuals show significant autocorrelation.

Daily

# Prophet on daily sales: build (ds, y) from the raw daily ts (no log
# back-transform here, unlike the monthly/weekly sections), fit, forecast
# 120 days ahead, and run residual diagnostics.
head(sales_d_ts)
## Time Series:
## Start = c(2021, 334) 
## End = c(2021, 339) 
## Frequency = 365 
## [1] 1493701  673701 1205301 1340901 1343701  360901
plot(sales_d_ts)

sales_d_values <- as.numeric(sales_d_ts)   # Extract numeric values

df_prophet_d <- data.frame(
  ds = df_merged_d$date,  # Dates
  y = sales_d_values   # Sales values
)
#Prophet model
# Weekly seasonality is auto-detected for daily data, so the commented-out
# explicit flag below is not needed.

#prophet_sales_d <- prophet(df_prophet, weekly.seasonality = TRUE)
prophet_sales_d <- prophet(df_prophet_d)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future_sales_d <- make_future_dataframe(prophet_sales_d,
                                        periods = 120,
                                        freq = 'day',
                                        include_history = T)
tail(future_sales_d)
forecast_sales_d <- predict(prophet_sales_d, future_sales_d)
tail(forecast_sales_d[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(prophet_sales_d, forecast_sales_d)

# Trend / seasonality decomposition of the forecast.
prophet_plot_components(prophet_sales_d, forecast_sales_d)

# Interactive (dygraphs) version of the forecast plot.
dyplot.prophet(prophet_sales_d, forecast_sales_d)
# Extract fitted values for RMSE calculation
fitted_values_d <- predict(prophet_sales_d, df_prophet_d)

# Extract fitted values (column 'yhat')
fitted_y_d <- fitted_values_d$yhat
actual_y_d <- df_prophet_d$y  # Actual sales values

# Step 8: Calculate in-sample RMSE
rmse_prophet_d <- calculate_rmse(observed = actual_y_d, predicted = fitted_y_d)

# Print RMSE
cat("RMSE for Prophet Fitted Values (Daily):", rmse_prophet_d, "\n")
## RMSE for Prophet Fitted Values (Daily): 1071375
# Calculate Residuals
residuals_prophet_d <- actual_y_d - fitted_y_d  # Residuals = Actual - Fitted

# Visualize Residuals (series + ACF/PACF)

tsdisplay(residuals_prophet_d, main = "Residual Diagnostics for Daily Prophet Model")

# Perform ADF Test for Stationarity (H0 = unit root)

adf_test_d <- adf.test(residuals_prophet_d)
## Warning in adf.test(residuals_prophet_d): p-value smaller than printed p-value
cat("ADF Test p-value:", adf_test_d$p.value, "\n")
## ADF Test p-value: 0.01
if (adf_test_d$p.value < 0.05) {
  cat("Residuals are stationary (reject H0).\n")
} else {
  cat("Residuals are not stationary (fail to reject H0).\n")
}
## Residuals are stationary (reject H0).
# Perform Serial Correlation Test (Ljung-Box; H0 = no autocorrelation)
ljung_box_test_d <- Box.test(residuals_prophet_d, lag = 20, type = "Ljung-Box")
cat("Ljung-Box Test p-value:", ljung_box_test_d$p.value, "\n")
## Ljung-Box Test p-value: 0
if (ljung_box_test_d$p.value > 0.05) {
  cat("Residuals resemble white noise (no significant autocorrelation).\n")
} else {
  cat("Residuals show significant autocorrelation.\n")
}
## Residuals show significant autocorrelation.
# Collect the in-sample RMSEs of every fitted model, ordered
# monthly / weekly / daily within each model family.
# BUG FIX: the daily exponential-smoothing slot previously repeated
# rmse_exp_sm_m (the monthly value, 16189623, appeared at both positions
# 16 and 18); it now correctly uses rmse_exp_sm_d (1642520, see the SES
# section above).
rmse_list <- c(rmse_ols_m, rmse_ols_w, rmse_ols_d,
               rmse_bm_m, rmse_bm_w, rmse_bm_d,
               rmse_ggm1, rmse_ggm_w, rmse_ggm_d,
               rmse_hw2,
               rmse_auto_arima, rmse_auto_arima_w, rmse_auto_arima_d,
               rmse_sarima_d,
               rmse_sarimax_d,
               rmse_exp_sm_m, rmse_exp_sm_w, rmse_exp_sm_d,
               rmse_mixture_m, rmse_mixture_w, rmse_mixture_d,
               rmse_prophet_m, rmse_prophet_w, rmse_prophet_d
               )
rmse_list
##  [1] 13229295  4000370  1421301 18498870  4542019  1651896 11759505  3453199
##  [9]  1600510 13169921 15118942  3328293  1094980  1094980  1146332 16189623
## [17]  3331847  1642520 20987648  4567633  1629664 16786939  3246088  1071375

Evaluation of all models

# Build a Model x Frequency table of in-sample RMSE values.
# The table is assembled in a single data.frame() call instead of being
# grown with rbind() inside a loop (which re-copies the table on every
# iteration); explicit row.names reproduce the model-named rows of the
# original printed output.

# Monthly RMSE values
rmse_monthly <- c(
  "OLS" = rmse_ols_m,
  "Bass_Model" = rmse_bm_m,
  "GGM" = rmse_ggm1,
  "Holt_Winters" = rmse_hw2,
  "Arima" = rmse_auto_arima,
  "Exp_Smooth" = rmse_exp_sm_m,
  "GGM+SARIMA" = rmse_mixture_m,
  "Prophet" = rmse_prophet_m
)

# Weekly RMSE values (NaN where the model was not fitted at this frequency)
rmse_weekly <- c(
  "OLS" = rmse_ols_w,
  "Bass_Model" = rmse_bm_w,
  "GGM" = rmse_ggm_w,
  "Holt_Winters" = NaN,
  "Arima" = rmse_auto_arima_w,
  "Exp_Smooth" = rmse_exp_sm_w,
  "GGM+SARIMA" = rmse_mixture_w,
  "Prophet" = rmse_prophet_w
)

# Daily RMSE values (NaN where the model was not fitted at this frequency)
rmse_daily <- c(
  "OLS" = rmse_ols_d,
  "Bass_Model" = rmse_bm_d,
  "GGM" = rmse_ggm_d,
  "Holt_Winters" = NaN,
  "Arima" = rmse_auto_arima_d,
  "Exp_Smooth" = rmse_exp_sm_d,
  "GGM+SARIMA" = rmse_mixture_d,
  "Prophet" = rmse_prophet_d
)

# Combine RMSE values into a table: one row per model, one column per
# frequency. Weekly/daily vectors are indexed by the monthly names so the
# rows stay aligned even if their declaration order ever changes.
rmse_table <- data.frame(
  Model = names(rmse_monthly),
  Monthly = unname(rmse_monthly),
  Weekly = unname(rmse_weekly[names(rmse_monthly)]),
  Daily = unname(rmse_daily[names(rmse_monthly)]),
  row.names = names(rmse_monthly),
  stringsAsFactors = FALSE
)

# View the RMSE table
print(rmse_table)
##                     Model  Monthly  Weekly   Daily
## OLS                   OLS 13229295 4000370 1421301
## Bass_Model     Bass_Model 18498870 4542019 1651896
## GGM                   GGM 11759505 3453199 1600510
## Holt_Winters Holt_Winters 13169921     NaN     NaN
## Arima               Arima 15118942 3328293 1094980
## Exp_Smooth     Exp_Smooth 16189623 3331847 1642520
## GGM+SARIMA     GGM+SARIMA 20987648 4567633 1629664
## Prophet           Prophet 16786939 3246088 1071375

Best models are:

  • Monthly: GGM
  • Weekly: Prophet
  • Daily: Prophet

Evaluation of best models on Test Set

Import test set

# Import the held-out test set and aggregate it to monthly and weekly
# frequency, mirroring the aggregation used for the training data.

# target variable
test_sales_df <- read_excel("data/sales/test_data.xlsx")
head(test_sales_df)
df_sales_m_test <- test_sales_df %>%
  mutate(month = floor_date(date, "month")) %>% # Extract month
  group_by(month) %>%
  summarise(sales_m = sum(sales_cop), bar_m = sum(bar), food_m = sum(food)
            )     # Summing values

head(df_sales_m_test)
## sales weekly
df_sales_w_test <- test_sales_df %>%
  mutate(week = floor_date(date, "week")) %>% # Extract week
  group_by(week) %>%
  summarise(sales_w = sum(sales_cop), bar_w = sum(bar), food_w = sum(food))     # Summing values

head(df_sales_w_test)

Forecast vs Actual

Monthly

GGM

# Out-of-sample GGM forecast (monthly): predict 38 future cumulative points,
# convert to instantaneous sales, and plot history + fit + forecast.

# Length of historical data
N <- length(sales_m_ts)

# Time index for the next 38 periods
future_times <- matrix((N + 1):(N + 38), ncol = 1)

# Predict cumulative sales for the next 38 periods (DIMORA predict on the
# fitted GGM object)
ggm_future_cum <- predict(ggm1, newx = future_times)
# Convert cumulative sales forecast to instantaneous sales.
# NOTE(review): [-1] drops the first element so 37 values remain —
# presumably to remove the differencing artifact at the series boundary;
# verify against the dates_ggm sequence used later (37 months).
ggm_future_inst <- make.instantaneous(ggm_future_cum)[-1]
# Get the end time of sales_m_ts
end_time <- tsp(sales_m_ts)[2]  # End time of original series

# Start the forecast at the next period
forecast_start <- end_time + 1 / frequency(sales_m_ts)

# Create a time series object for the forecast
ggm_future_inst_ts <- ts(
  ggm_future_inst,
  start = forecast_start,
  frequency = frequency(sales_m_ts)
)

# Step 5: Plot the forecast with aligned index
plot(
  sales_m_ts,
  type = "o",
  col = "black",
  lwd = 2,
  main = "Original, Fitted, and Forecasted Instantaneous Sales",
  xlab = "Time",
  ylab = "Monthly Sales",
  xlim = c(start(sales_m_ts)[1], forecast_start + 38 / frequency(sales_m_ts))
)

# NOTE(review): pred.inst_ggm_m is assumed to be the in-sample instantaneous
# GGM fit created earlier in the file — confirm.
lines(
  pred.inst_ggm_m,
  col = "red",
  lty = 2,
  lwd = 2
)

lines(
  ggm_future_inst_ts,
  col = "blue",
  lty = 2,
  lwd = 2
)

legend(
  "bottomright",
  legend = c("Original Sales", "Fitted Values", "Forecast"),
  col = c("black", "red", "blue"),
  lty = c(1, 2, 2),
  lwd = c(2, 2, 2)
)

The GGM forecast suggests the restaurant is losing momentum, but in practice restaurants can perform well for much longer than the period estimated by the GGM. We therefore treat this as a downside-scenario forecast. Let's also forecast a baseline scenario with the second-best model: Holt-Winters.

Holt-Winters: Multiplicative

# Baseline scenario: Holt-Winters (multiplicative seasonality), then compare
# both candidate models against the first two months of the test set.

# Step 1: Re-Fit the Holt-Winters model
hw2_m <- hw(sales_m_ts, seasonal = "multiplicative")

# Step 2: Generate the forecast for the next 24 months
forecast_hw <- forecast(hw2_m, h = 24)

# Step 3: Use autoplot to plot the original values and the forecast
autoplot(forecast_hw) +
  ggtitle("Holt-Winters Forecast and Original Sales") +
  xlab("Time") +
  ylab("Monthly Sales") +
  theme_minimal()

# Compare actual vs forecasted: only the first two forecast horizons are
# evaluated because the test set covers two months.
actual_sales_m <- df_sales_m_test$sales_m
forecasted_hw <- forecast_hw$mean[1:2]
forecasted_ggm <- ggm_future_inst_ts[1:2]
rmse_test_hw <-calculate_rmse(actual_sales_m, forecasted_hw)
rmse_test_ggm <- calculate_rmse(actual_sales_m, forecasted_ggm)
cat("Error Holt-Winters:", rmse_test_hw, "\n")
## Error Holt-Winters: 33822273
cat("Error GGM:", rmse_test_ggm, "\n")
## Error GGM: 17357973

The GGM error is lower; we can combine (smooth) both scenarios for the final forecast.

# Side-by-side bars: actual monthly sales vs. the two model forecasts for
# the two held-out test periods.
plot_data <- data.frame(
  Period      = c("Last Period 1", "Last Period 2"),
  Actual      = actual_sales_m,
  HoltWinters = forecasted_hw,
  GGM         = forecasted_ggm
)

# Long format (one row per Period/series pair), as ggplot expects.
plot_data_melt <- melt(
  plot_data,
  id.vars       = "Period",
  variable.name = "Type",
  value.name    = "Sales"
)

# Grouped (dodged) bar chart comparing the three series per period.
ggplot(plot_data_melt, aes(x = Period, y = Sales, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Actual vs Forecasted Sales",
    x     = "Period",
    y     = "Sales",
    fill  = "Type"
  ) +
  theme_minimal()

Weekly

Prophet

# Weekly Prophet vs. the test set: overlay the Nov 2024 - early Jan 2025
# forecast on the actual weekly test sales.

# Filter forecast for November and December 2024
# NOTE(review): the tzone warning comes from comparing Prophet's POSIXct ds
# against a Date — harmless here, but coercing ds with as.Date() first would
# silence it.
forecast_nov_dec_w <- forecast_sales_w %>%
  filter(ds >= as.Date("2024-11-01") & ds <= as.Date("2025-01-05"))
## Warning: There were 2 warnings in `filter()`.
## The first warning was:
## ℹ In argument: `ds >= as.Date("2024-11-01") & ds <= as.Date("2025-01-05")`.
## Caused by warning in `check_tzones()`:
## ! 'tzone' attributes are inconsistent
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Plot the actual sales with black dots
plot(as.Date(df_sales_w_test$week), df_sales_w_test$sales_w,
     type = "p",                             # Points for actual sales
     pch = 16,                               # Solid circles
     col = "black",                          # Black color for points
     xlab = "Date",                          # X-axis label
     ylab = "Sales",                         # Y-axis label
     main = "Actual Sales vs Forecasted Sales (Nov-Dec 2024)")  # Title

# Add the forecast line in red
lines(as.Date(forecast_nov_dec_w$ds), forecast_nov_dec_w$yhat,
      col = "red",                          # Red color for the line
      lwd = 2)                              # Wider line

# Add a legend in the top right
legend("topright",
       legend = c("Actual Sales", "Forecasted Sales"),
       col = c("black", "red"),             # Colors matching the plot
       pch = c(16, NA),                     # Solid dot for actual sales, none for line
       lty = c(NA, 1),                      # Line for forecasted sales, none for dot
       lwd = c(NA, 2))                      # Line width for the forecast

Daily

Prophet

# Daily Prophet vs. the test set: overlay the Nov 2024 - early Jan 2025
# forecast on the actual daily test sales.

# Filter forecast for November and December 2024
# NOTE(review): same tzone warning as the weekly section — comparing
# POSIXct ds against Date; harmless.
forecast_nov_dec_d <- forecast_sales_d %>%
  filter(ds >= as.Date("2024-11-01") & ds <= as.Date("2025-01-05"))
## Warning: There were 2 warnings in `filter()`.
## The first warning was:
## ℹ In argument: `ds >= as.Date("2024-11-01") & ds <= as.Date("2025-01-05")`.
## Caused by warning in `check_tzones()`:
## ! 'tzone' attributes are inconsistent
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Plot the actual sales with black dots
plot(as.Date(test_sales_df$date), test_sales_df$sales_cop,
     type = "p",                             # Points for actual sales
     pch = 16,                               # Solid circles
     col = "black",                          # Black color for points
     xlab = "Date",                          # X-axis label
     ylab = "Sales",                         # Y-axis label
     main = "Actual Sales vs Forecasted Sales (Nov-Dec 2024)")  # Title

# Add the forecast line in red
lines(as.Date(forecast_nov_dec_d$ds), forecast_nov_dec_d$yhat,
      col = "red",                          # Red color for the line
      lwd = 2)                              # Wider line

# Add a legend in the top right
legend("topright",
       legend = c("Actual Sales", "Forecasted Sales"),
       col = c("black", "red"),             # Colors matching the plot
       pch = c(16, NA),                     # Solid dot for actual sales, none for line
       lty = c(NA, 1),                      # Line for forecasted sales, none for dot
       lwd = c(NA, 2))                      # Line width for the forecast

Forecast with Best Models

Finally, we want the long-term forecast, so we use the monthly combination of models for that. ### Monthly

# Final long-term forecast: average the Holt-Winters (baseline) and GGM
# (downside) monthly forecasts on their overlapping horizon.
long_term_forecasted_hw <- forecast_hw$mean
long_term_forecasted_ggm <- ggm_future_inst_ts
# Define the date ranges for both series: 24 months for Holt-Winters
# (h = 24) and 37 months for the GGM forecast.
dates_hw <- seq(as.Date("2024-11-01"), as.Date("2026-10-01"), by = "month")
dates_ggm <- seq(as.Date("2024-11-01"), as.Date("2027-11-01"), by = "month")

# Create data frames for the two series
df_hw <- data.frame(Date = dates_hw, Value = long_term_forecasted_hw)
df_ggm <- data.frame(Date = dates_ggm, Value = long_term_forecasted_ggm)

# Merge the two series on the common dates.
# merge() defaults to an inner join, so the combined forecast only spans
# the shorter (24-month Holt-Winters) horizon.
df_merged <- merge(df_hw, df_ggm, by = "Date", suffixes = c("_hw", "_ggm"))

# Calculate the mean of both series where data is available
df_merged$Mean <- rowMeans(df_merged[, c("Value_hw", "Value_ggm")], na.rm = TRUE)

# Create the new time series
long_term_forecast <- data.frame(Date = df_merged$Date, Mean = df_merged$Mean)
plot(long_term_forecast)